[llvm] e39f6c1 - [opt] Infer DataLayout from triple if not specified

Alex Richardson via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 26 12:12:25 PDT 2023


Author: Alex Richardson
Date: 2023-10-26T12:07:37-07:00
New Revision: e39f6c1844fab59c638d8059a6cf139adb42279a

URL: https://github.com/llvm/llvm-project/commit/e39f6c1844fab59c638d8059a6cf139adb42279a
DIFF: https://github.com/llvm/llvm-project/commit/e39f6c1844fab59c638d8059a6cf139adb42279a.diff

LOG: [opt] Infer DataLayout from triple if not specified

There are many tests that specify a target triple and/or CPU flags but no
DataLayout, which can lead to IR with unusual behaviour. This commit
attempts to use the default DataLayout implied by those flags whenever
there is no explicit override on the command line or in the IR file.
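
As a sketch of the new behaviour (hypothetical test; the exact layout
string is target-specific, and the added
llvm/test/tools/opt/infer-data-layout.ll provides the real coverage), a
module with a triple but no "target datalayout" line is now printed with
the target's default layout filled in:

  ; RUN: opt -S -mtriple=aarch64-linux-gnu -passes=no-op-module %s | FileCheck %s
  ; No "target datalayout" line below: opt infers the AArch64 default,
  ; so the printed module gains a non-empty layout string.
  ; CHECK: target datalayout = "e-m:e-{{.+}}"
  target triple = "aarch64-linux-gnu"

  define void @f() {
    ret void
  }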

One thing that is currently not possible is differentiating a missing
datalayout from an explicitly empty one (`target datalayout = ""`) in the
IR file, since the current APIs don't allow detecting this case. If it is
considered useful to support this case (instead of passing "-data-layout="
on the command line), I can change the IR parsers to track whether they
have seen such a directive and change the callback type.
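
On the implementation side, a minimal sketch of the inference step (not
the verbatim opt.cpp change; the helper name is hypothetical, and only
the standard TargetRegistry/TargetMachine APIs are assumed):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/MC/TargetRegistry.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetOptions.h"
  #include "llvm/TargetParser/Triple.h"
  #include <memory>
  #include <optional>
  #include <string>
  using namespace llvm;

  // Returns the target's default layout string, or std::nullopt for an
  // unknown triple (in which case the module's layout is left untouched).
  static std::optional<std::string>
  inferDataLayout(StringRef TripleStr, StringRef CPU, StringRef Features) {
    std::string Err;
    Triple TheTriple(Triple::normalize(TripleStr));
    const Target *T = TargetRegistry::lookupTarget(TheTriple.getTriple(), Err);
    if (!T || !T->hasTargetMachine())
      return std::nullopt;
    std::unique_ptr<TargetMachine> TM(T->createTargetMachine(
        TheTriple.getTriple(), CPU, Features, TargetOptions(), std::nullopt));
    return TM->createDataLayout().getStringRepresentation();
  }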

Differential Revision: https://reviews.llvm.org/D141060

Added: 
    llvm/test/tools/opt/infer-data-layout.ll
    llvm/test/tools/opt/invalid-target.ll

Modified: 
    llvm/test/Analysis/BasicAA/libfuncs.ll
    llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
    llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
    llvm/test/Analysis/CostModel/AMDGPU/cast.ll
    llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll
    llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
    llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll
    llvm/test/Analysis/CostModel/RISCV/cast.ll
    llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll
    llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll
    llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
    llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
    llvm/test/Analysis/CostModel/X86/load-to-trunc.ll
    llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
    llvm/test/Analysis/CostModel/X86/size-cost.ll
    llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
    llvm/test/Analysis/CostModel/X86/trunc-latency.ll
    llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
    llvm/test/Analysis/CostModel/X86/trunc.ll
    llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll
    llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
    llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
    llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
    llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll
    llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
    llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
    llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
    llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
    llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
    llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
    llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
    llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
    llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
    llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
    llvm/test/CodeGen/AMDGPU/opencl-printf.ll
    llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
    llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
    llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
    llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
    llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll
    llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll
    llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll
    llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll
    llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll
    llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
    llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
    llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
    llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
    llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
    llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
    llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
    llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll
    llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll
    llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll
    llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll
    llvm/test/Transforms/InferFunctionAttrs/annotate.ll
    llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
    llvm/test/Transforms/InstCombine/ffs-i16.ll
    llvm/test/Transforms/InstCombine/fls-i16.ll
    llvm/test/Transforms/InstCombine/isascii-i16.ll
    llvm/test/Transforms/InstCombine/isdigit-i16.ll
    llvm/test/Transforms/InstCombine/pow-4.ll
    llvm/test/Transforms/InstCombine/pow_fp_int.ll
    llvm/test/Transforms/InstCombine/printf-i16.ll
    llvm/test/Transforms/InstCombine/puts-i16.ll
    llvm/test/Transforms/InstCombine/sincospi.ll
    llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll
    llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
    llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll
    llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll
    llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
    llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
    llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
    llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
    llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll
    llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
    llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
    llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
    llvm/test/Transforms/MemCpyOpt/no-libcalls.ll
    llvm/test/Transforms/MergeICmps/X86/addressspaces.ll
    llvm/test/Transforms/NewGVN/refine-stores.ll
    llvm/test/Transforms/OpenMP/barrier_removal.ll
    llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
    llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
    llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
    llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
    llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
    llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
    llvm/test/Transforms/SafeStack/X86/setjmp2.ll
    llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll
    llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll
    llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
    llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll
    llvm/test/Transforms/TypePromotion/ARM/pointers.ll
    llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
    llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
    llvm/tools/opt/opt.cpp

Removed: 
    


################################################################################
diff --git a/llvm/test/Analysis/BasicAA/libfuncs.ll b/llvm/test/Analysis/BasicAA/libfuncs.ll
index b584e5327be1f0a..e1ad376ad383bfe 100644
--- a/llvm/test/Analysis/BasicAA/libfuncs.ll
+++ b/llvm/test/Analysis/BasicAA/libfuncs.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=i386-pc-linux-gnu -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-pc-linux-gnu -aa-pipeline=basic-aa -passes=inferattrs,aa-eval -print-all-alias-modref-info -disable-output 2>&1 %s | FileCheck %s
 
 ; CHECK-LABEL: Function: test_memcmp_const_size
 ; CHECK:      Just Ref:  Ptr: i8* %a	<->  %res = tail call i32 @memcmp(ptr %a, ptr %b, i64 4)

diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
index c322fad94e969a0..020d34deb8c69cb 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll
@@ -1,19 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
 define void @sve_fptruncs() {
-  ;CHECK-LABEL: 'sve_fptruncs'
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction:   %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction:   %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+; CHECK-LABEL: 'sve_fptruncs'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f16_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8_f16_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f32_from_f64 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f32_from_f64 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8_f32_from_f64 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %nxv2_f16_from_f32 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
   %nxv4_f16_from_f32 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
   %nxv8_f16_from_f32 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>

diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
index c9d03296e85bdba..767bd5f5b75cb84 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll
@@ -1,24 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @sve_truncs() {
-  ;CHECK-LABEL: 'sve_truncs'
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v2i16_to_i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v2i32_to_i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v2i64_to_i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v4i16_to_i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v4i32_to_i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %trunc_v4i64_to_i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v8i16_to_i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %trunc_v8i32_to_i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction:   %trunc_v8i64_to_i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v2i32_to_i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v2i64_to_i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %trunc_v4i32_to_i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %trunc_v4i64_to_i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %trunc_v8i32_to_i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
-  ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction:   %trunc_v8i64_to_i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
+; CHECK-LABEL: 'sve_truncs'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i16_to_i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i16_to_i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v8i16_to_i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %trunc_v8i64_to_i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i32_to_i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v2i64_to_i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc_v4i32_to_i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %trunc_v4i64_to_i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %trunc_v8i32_to_i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %trunc_v8i64_to_i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %trunc_v2i16_to_i1  = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
   %trunc_v2i32_to_i1  = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
   %trunc_v2i64_to_i1  = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll
index 55757e9b57957cf..fc8f5fc393396c6 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/cast.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/cast.ll
@@ -74,11 +74,11 @@ define i32 @zext_sext(<8 x i1> %in) {
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
-; FAST-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
-; FAST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
-; FAST-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; FAST-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; FAST-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef
 ;
 ; SLOW-LABEL: 'zext_sext'
@@ -134,11 +134,11 @@ define i32 @zext_sext(<8 x i1> %in) {
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %E = trunc <4 x i64> undef to <4 x i32>
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F = trunc <8 x i32> undef to <8 x i16>
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F1 = trunc <16 x i16> undef to <16 x i8>
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F2 = trunc <8 x i32> undef to <8 x i8>
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %F3 = trunc <4 x i64> undef to <4 x i8>
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %G = trunc <8 x i64> undef to <8 x i32>
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %G1 = trunc <16 x i32> undef to <16 x i16>
-; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
+; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %G2 = trunc <16 x i32> undef to <16 x i8>
 ; FAST-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLOW-SIZE-LABEL: 'zext_sext'

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll b/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll
index 0c16e3f095be5c1..d304acc79f5ed4b 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/load-to-trunc.ll
@@ -8,7 +8,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -19,7 +19,7 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr

diff --git a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
index ba776852396da23..44042094fb06baf 100644
--- a/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/ARM/load-to-trunc.ll
@@ -9,8 +9,8 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc = trunc i128 %out to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
   %out = load i128, ptr %ptr
@@ -20,7 +20,7 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr

diff --git a/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll b/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll
index 6297f60e0108fed..57a6e98cfb4ee63 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/load-to-trunc.ll
@@ -7,7 +7,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -18,7 +18,7 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr

diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index c486af1cfd4a80f..bd26c19c2f2c3c9 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -1075,7 +1075,7 @@ define void @trunc() {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
@@ -1085,7 +1085,7 @@ define void @trunc() {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
@@ -1095,7 +1095,7 @@ define void @trunc() {
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
+; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
 ; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
@@ -1227,8 +1227,8 @@ define void @trunc() {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
@@ -1237,8 +1237,8 @@ define void @trunc() {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
@@ -1247,8 +1247,8 @@ define void @trunc() {
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
+; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
 ; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>

diff --git a/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll
index 81299de2d496811..ca722a739a5e151 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fca-load-store.ll
@@ -4,10 +4,10 @@
 
 define void @load(ptr %p) {
 ; CHECK-LABEL: 'load'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %1 = load [2 x i64], ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = load [4 x i64], ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = load { i64, i64 }, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = load { i64, i32 }, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %1 = load [2 x i64], ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = load [4 x i64], ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = load { i64, i64 }, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %4 = load { i64, i32 }, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   load [2 x i64], ptr %p
@@ -20,10 +20,10 @@ define void @load(ptr %p) {
 
 define void @store(ptr %p) {
 ; CHECK-LABEL: 'store'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store [2 x i64] undef, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store [4 x i64] undef, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store { i64, i64 } undef, ptr %p, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store { i64, i32 } undef, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store [2 x i64] undef, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store [4 x i64] undef, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store { i64, i64 } undef, ptr %p, align 8
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store { i64, i32 } undef, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   store [2 x i64] undef, ptr %p

diff --git a/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll b/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll
index 189cda24327dbef..2e400c74985fd49 100644
--- a/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/load-to-trunc.ll
@@ -8,8 +8,8 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %trunc = trunc i128 %out to i32
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 16
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
   %out = load i128, ptr %ptr
@@ -19,7 +19,7 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %out = load i128, ptr %ptr, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr

diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
index df02b5374f77086..669ccf4a1b2632d 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-load-store.ll
@@ -44,7 +44,7 @@ define void @load(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %37 = load <vscale x 8 x i32>, ptr %p, align 32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %38 = load <vscale x 16 x i32>, ptr %p, align 64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %39 = load <vscale x 32 x i32>, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %40 = load i64, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = load <1 x i64>, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %42 = load <2 x i64>, ptr %p, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %43 = load <4 x i64>, ptr %p, align 32
@@ -187,7 +187,7 @@ define void @store(ptr %p) {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: store <vscale x 8 x i32> undef, ptr %p, align 32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: store <vscale x 16 x i32> undef, ptr %p, align 64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: store <vscale x 32 x i32> undef, ptr %p, align 128
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <1 x i64> undef, ptr %p, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store <2 x i64> undef, ptr %p, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: store <4 x i64> undef, ptr %p, align 32

diff --git a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll b/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
index 2f643f156c1eeab..cd6af575ea9ec3d 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/load-to-trunc.ll
@@ -8,7 +8,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -19,7 +19,7 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 8
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr

diff --git a/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll b/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll
index 813e7a191259df3..8386f2b5e2b5d34 100644
--- a/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll
+++ b/llvm/test/Analysis/CostModel/X86/load-to-trunc.ll
@@ -9,7 +9,7 @@
 ; Check that cost is 1 for unusual load to register sized load.
 define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualIntegerWithTrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %trunc = trunc i128 %out to i32
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %trunc
 ;
@@ -20,7 +20,7 @@ define i32 @loadUnusualIntegerWithTrunc(ptr %ptr) {
 
 define i128 @loadUnusualInteger(ptr %ptr) {
 ; CHECK-LABEL: 'loadUnusualInteger'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 4
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %out = load i128, ptr %ptr, align 16
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i128 %out
 ;
   %out = load i128, ptr %ptr

diff --git a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
index 414ffe927e5fc0b..47bbc53f3cd7848 100644
--- a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
+++ b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
@@ -542,20 +542,20 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
@@ -569,8 +569,8 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
@@ -587,8 +587,8 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512VL256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512VL512-LABEL: 'trunc_vXi1'
@@ -596,8 +596,8 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
@@ -608,14 +608,14 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512VL512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SKX256-LABEL: 'trunc_vXi1'
@@ -623,14 +623,14 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SKX256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SKX256-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SKX256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SKX256-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SKX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; SKX256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
@@ -650,14 +650,14 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; SKX512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; SKX512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; SKX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; SKX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; SKX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; SKX512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; SKX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; SKX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
 ; SKX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>

diff --git a/llvm/test/Analysis/CostModel/X86/size-cost.ll b/llvm/test/Analysis/CostModel/X86/size-cost.ll
index 41c16af96340cdb..aa905dcec777a6a 100644
--- a/llvm/test/Analysis/CostModel/X86/size-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/size-cost.ll
@@ -48,7 +48,7 @@ define double @bitcast_i64_f64(i64 %x) {
 
 define ptr @inttoptr_i64_p64(i64 %x) {
 ; CHECK-LABEL: 'inttoptr_i64_p64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = inttoptr i64 %x to ptr
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = inttoptr i64 %x to ptr
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr %r
 ;
   %r = inttoptr i64 %x to ptr
@@ -57,7 +57,7 @@ define ptr @inttoptr_i64_p64(i64 %x) {
 
 define i64 @ptrtoint_p64_i64(ptr %x) {
 ; CHECK-LABEL: 'ptrtoint_p64_i64'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = ptrtoint ptr %x to i64
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %r = ptrtoint ptr %x to i64
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
 ;
   %r = ptrtoint ptr %x to i64

diff --git a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
index 6419302da03f6c9..9c4abfef52d9164 100644
--- a/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-codesize.ll
@@ -537,26 +537,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -572,26 +572,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -607,26 +607,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -663,292 +663,6 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'trunc_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX256-LABEL: 'trunc_vXi1'
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i1
   %V2i64 = trunc <2 x i64> undef to <2 x i1>
@@ -1096,3 +810,6 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 
   ret i32 undef
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX256: {{.*}}
+; AVX512: {{.*}}

diff --git a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll
index 20789ad7bd50f6c..f6092b9f5bab10d 100644
--- a/llvm/test/Analysis/CostModel/X86/trunc-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-latency.ll
@@ -537,26 +537,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -572,26 +572,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -607,26 +607,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -663,292 +663,6 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'trunc_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX256-LABEL: 'trunc_vXi1'
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i1
   %V2i64 = trunc <2 x i64> undef to <2 x i1>
@@ -1096,3 +810,6 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 
   ret i32 undef
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX256: {{.*}}
+; AVX512: {{.*}}
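
The removed AVX512 and AVX256 blocks, together with the autogenerated "unused prefixes" note just above, indicate that all RUN configurations of this test now produce the same cost-model output, so update_analyze_test_checks.py keeps only the shared AVX-prefixed checks. A minimal hypothetical sketch of the pattern (these RUN lines and feature flags are illustrative, not copied from the test):

; RUN: opt < %s -passes="print<cost-model>" -disable-output -mtriple=x86_64-- -mattr=+avx2 2>&1 | FileCheck %s --check-prefixes=AVX,AVX256
; RUN: opt < %s -passes="print<cost-model>" -disable-output -mtriple=x86_64-- -mattr=+avx512f 2>&1 | FileCheck %s --check-prefixes=AVX,AVX512
;; When both runs emit identical lines, the update script folds everything under
;; the common AVX prefix and appends "AVX256: {{.*}}" / "AVX512: {{.*}}"
;; placeholders so FileCheck does not error on prefixes with no directives.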

diff  --git a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
index a705c47849fdce8..49ed2902da45ba8 100644
--- a/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc-sizelatency.ll
@@ -537,26 +537,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -572,26 +572,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -607,26 +607,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -663,292 +663,6 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX512-LABEL: 'trunc_vXi1'
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
-;
-; AVX256-LABEL: 'trunc_vXi1'
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i64 = trunc i64 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = trunc <2 x i64> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i64 = trunc <3 x i64> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = trunc <4 x i64> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i64 = trunc <5 x i64> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i64 = trunc <6 x i64> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i64 = trunc <7 x i64> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = trunc <8 x i64> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i64 = trunc <10 x i64> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i64 = trunc <12 x i64> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i64 = trunc <14 x i64> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i64 = trunc <16 x i64> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = trunc <4 x i32> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i32 = trunc <5 x i32> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i32 = trunc <6 x i32> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i32 = trunc <7 x i32> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = trunc <8 x i32> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i32 = trunc <10 x i32> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i32 = trunc <12 x i32> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i32 = trunc <14 x i32> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = trunc <16 x i32> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = trunc <4 x i16> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i16 = trunc <5 x i16> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i16 = trunc <6 x i16> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i16 = trunc <7 x i16> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = trunc <8 x i16> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i16 = trunc <10 x i16> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i16 = trunc <12 x i16> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i16 = trunc <14 x i16> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i16 = trunc <16 x i16> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = trunc <4 x i8> undef to <4 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V5i8 = trunc <5 x i8> undef to <5 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V6i8 = trunc <6 x i8> undef to <6 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V7i8 = trunc <7 x i8> undef to <7 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = trunc <8 x i8> undef to <8 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V10i8 = trunc <10 x i8> undef to <10 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V12i8 = trunc <12 x i8> undef to <12 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V14i8 = trunc <14 x i8> undef to <14 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = trunc <16 x i8> undef to <16 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
-; AVX256-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
   %i64 = trunc i64 undef to i1
   %V2i64 = trunc <2 x i64> undef to <2 x i1>
@@ -1096,3 +810,6 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 
   ret i32 undef
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX256: {{.*}}
+; AVX512: {{.*}}

diff --git a/llvm/test/Analysis/CostModel/X86/trunc.ll b/llvm/test/Analysis/CostModel/X86/trunc.ll
index 98e6fa5348b9c4f..2d062742ef68543 100644
--- a/llvm/test/Analysis/CostModel/X86/trunc.ll
+++ b/llvm/test/Analysis/CostModel/X86/trunc.ll
@@ -2754,26 +2754,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -2789,26 +2789,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -2824,26 +2824,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -2897,26 +2897,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -2932,26 +2932,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 544 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -2967,26 +2967,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -3040,26 +3040,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 704 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -3110,26 +3110,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -3145,26 +3145,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
+; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX512FVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512FVEC256-LABEL: 'trunc_vXi1'
@@ -3183,26 +3183,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 352 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 704 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -3288,26 +3288,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 544 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
+; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX512FVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQVEC512-LABEL: 'trunc_vXi1'
@@ -3396,26 +3396,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>
@@ -3431,26 +3431,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 120 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 240 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
+; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX512DQVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQVEC256-LABEL: 'trunc_vXi1'
@@ -3574,26 +3574,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i8 = trunc <20 x i8> undef to <20 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i8 = trunc <24 x i8> undef to <24 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i8 = trunc <28 x i8> undef to <28 x i1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i8 = trunc <32 x i8> undef to <32 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i8 = trunc <40 x i8> undef to <40 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i8 = trunc <48 x i8> undef to <48 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i8 = trunc <56 x i8> undef to <56 x i1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i8 = trunc <64 x i8> undef to <64 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i8 = trunc <80 x i8> undef to <80 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i8 = trunc <112 x i8> undef to <112 x i1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i8 = trunc <128 x i8> undef to <128 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i8 = trunc <160 x i8> undef to <160 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i8 = trunc <192 x i8> undef to <192 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i8 = trunc <224 x i8> undef to <224 x i1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i8 = trunc <256 x i8> undef to <256 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i8 = trunc <320 x i8> undef to <320 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i8 = trunc <384 x i8> undef to <384 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i8 = trunc <448 x i8> undef to <448 x i1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 272 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i8 = trunc <512 x i8> undef to <512 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i8 = trunc <640 x i8> undef to <640 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i8 = trunc <768 x i8> undef to <768 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i8 = trunc <896 x i8> undef to <896 x i1>
-; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 544 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
+; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i8 = trunc <1024 x i8> undef to <1024 x i1>
 ; AVX512DQVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BWVEC512-LABEL: 'trunc_vXi1'
@@ -3612,26 +3612,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 752 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -3647,26 +3647,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 176 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX512BWVEC512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -3755,26 +3755,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 94 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 188 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 376 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 752 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -3790,26 +3790,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 384 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 448 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 768 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 896 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 152 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1536 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 1792 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 304 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; AVX512BWVEC256-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -3898,26 +3898,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i64 = trunc <20 x i64> undef to <20 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i64 = trunc <24 x i64> undef to <24 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i64 = trunc <28 x i64> undef to <28 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i64 = trunc <32 x i64> undef to <32 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i64 = trunc <40 x i64> undef to <40 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i64 = trunc <48 x i64> undef to <48 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i64 = trunc <56 x i64> undef to <56 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i64 = trunc <64 x i64> undef to <64 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i64 = trunc <80 x i64> undef to <80 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i64 = trunc <112 x i64> undef to <112 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i64 = trunc <128 x i64> undef to <128 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i64 = trunc <160 x i64> undef to <160 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i64 = trunc <192 x i64> undef to <192 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i64 = trunc <224 x i64> undef to <224 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 184 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i64 = trunc <256 x i64> undef to <256 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i64 = trunc <320 x i64> undef to <320 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i64 = trunc <384 x i64> undef to <384 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i64 = trunc <448 x i64> undef to <448 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 368 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i64 = trunc <512 x i64> undef to <512 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i64 = trunc <640 x i64> undef to <640 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i64 = trunc <768 x i64> undef to <768 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i64 = trunc <896 x i64> undef to <896 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 736 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i64 = trunc <1024 x i64> undef to <1024 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i32 = trunc i32 undef to i1
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = trunc <2 x i32> undef to <2 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i32 = trunc <3 x i32> undef to <3 x i1>
@@ -3933,26 +3933,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i32 = trunc <20 x i32> undef to <20 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i32 = trunc <24 x i32> undef to <24 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i32 = trunc <28 x i32> undef to <28 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i32 = trunc <32 x i32> undef to <32 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i32 = trunc <40 x i32> undef to <40 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i32 = trunc <48 x i32> undef to <48 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i32 = trunc <56 x i32> undef to <56 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i32 = trunc <64 x i32> undef to <64 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i32 = trunc <80 x i32> undef to <80 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i32 = trunc <112 x i32> undef to <112 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i32 = trunc <128 x i32> undef to <128 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i32 = trunc <160 x i32> undef to <160 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i32 = trunc <192 x i32> undef to <192 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i32 = trunc <224 x i32> undef to <224 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 168 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i32 = trunc <256 x i32> undef to <256 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i32 = trunc <320 x i32> undef to <320 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i32 = trunc <384 x i32> undef to <384 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i32 = trunc <448 x i32> undef to <448 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 336 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i32 = trunc <512 x i32> undef to <512 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i32 = trunc <640 x i32> undef to <640 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i32 = trunc <768 x i32> undef to <768 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i32 = trunc <896 x i32> undef to <896 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 672 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i32 = trunc <1024 x i32> undef to <1024 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i16 = trunc i16 undef to i1
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = trunc <2 x i16> undef to <2 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i16 = trunc <3 x i16> undef to <3 x i1>
@@ -3968,26 +3968,26 @@ define i32 @trunc_vXi1() "min-legal-vector-width"="256" {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V20i16 = trunc <20 x i16> undef to <20 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V24i16 = trunc <24 x i16> undef to <24 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V28i16 = trunc <28 x i16> undef to <28 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V32i16 = trunc <32 x i16> undef to <32 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %V40i16 = trunc <40 x i16> undef to <40 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %V48i16 = trunc <48 x i16> undef to <48 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V56i16 = trunc <56 x i16> undef to <56 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V64i16 = trunc <64 x i16> undef to <64 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V80i16 = trunc <80 x i16> undef to <80 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 231 for instruction: %V112i16 = trunc <112 x i16> undef to <112 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V128i16 = trunc <128 x i16> undef to <128 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %V160i16 = trunc <160 x i16> undef to <160 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 396 for instruction: %V192i16 = trunc <192 x i16> undef to <192 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 462 for instruction: %V224i16 = trunc <224 x i16> undef to <224 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V256i16 = trunc <256 x i16> undef to <256 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 660 for instruction: %V320i16 = trunc <320 x i16> undef to <320 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 792 for instruction: %V384i16 = trunc <384 x i16> undef to <384 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 924 for instruction: %V448i16 = trunc <448 x i16> undef to <448 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V512i16 = trunc <512 x i16> undef to <512 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1320 for instruction: %V640i16 = trunc <640 x i16> undef to <640 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1584 for instruction: %V768i16 = trunc <768 x i16> undef to <768 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1848 for instruction: %V896i16 = trunc <896 x i16> undef to <896 x i1>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1024i16 = trunc <1024 x i16> undef to <1024 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %i8 = trunc i8 undef to i1
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = trunc <2 x i8> undef to <2 x i1>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V3i8 = trunc <3 x i8> undef to <3 x i1>

diff --git a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll
index 1a9c5974a547a81..964c669439d4da5 100644
--- a/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll
+++ b/llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll
@@ -8,8 +8,9 @@ define void @test_simple_sink(ptr %base, i64 %offset) {
 ; CHECK-NEXT:    [[TST:%.*]] = load i1, ptr [[ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[BASE]], i32 [[SUNKADDR]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR1]], align 1
 ; CHECK-NEXT:    ret void
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
@@ -33,8 +34,9 @@ define void @test_inbounds_sink(ptr %base, i64 %offset) {
 ; CHECK-NEXT:    [[TST:%.*]] = load i1, ptr [[ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[OFFSET]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i32 [[SUNKADDR]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR1]], align 1
 ; CHECK-NEXT:    ret void
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
@@ -61,8 +63,8 @@ define void @test_add_sink(ptr %base, i64 %offset) {
 ; CHECK-NEXT:    [[TST:%.*]] = load i1, ptr [[ADDR]], align 1
 ; CHECK-NEXT:    br i1 [[TST]], label [[NEXT:%.*]], label [[END:%.*]]
 ; CHECK:       next:
-; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load volatile i1, ptr [[SUNKADDR]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[ADDR64]] to ptr
+; CHECK-NEXT:    [[TMP2:%.*]] = load volatile i1, ptr [[TMP1]], align 1
 ; CHECK-NEXT:    ret void
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void

diff --git a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll
index 531c666277aae79..079f2e7eb481bf6 100644
--- a/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll
+++ b/llvm/test/CodeGen/AArch64/sme-aarch64-svcount-O3.ll
@@ -5,14 +5,14 @@
 define target("aarch64.svcount") @test_alloca_store_reload(target("aarch64.svcount") %val0, target("aarch64.svcount") %val1, ptr %iptr, ptr %pptr, i64 %N) nounwind {
 ; CHECK-LABEL: @test_alloca_store_reload(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store i64 0, ptr [[IPTR:%.*]], align 4
+; CHECK-NEXT:    store i64 0, ptr [[IPTR:%.*]], align 8
 ; CHECK-NEXT:    store target("aarch64.svcount") [[VAL0:%.*]], ptr [[PPTR:%.*]], align 2
 ; CHECK-NEXT:    [[I1_PEEL:%.*]] = icmp eq i64 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[I1_PEEL]], label [[LOOP_EXIT:%.*]], label [[LOOP_BODY:%.*]]
 ; CHECK:       loop.body:
 ; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ [[IND_NEXT:%.*]], [[LOOP_BODY]] ], [ 1, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[IPTR_GEP:%.*]] = getelementptr i64, ptr [[IPTR]], i64 [[IND]]
-; CHECK-NEXT:    store i64 [[IND]], ptr [[IPTR_GEP]], align 4
+; CHECK-NEXT:    store i64 [[IND]], ptr [[IPTR_GEP]], align 8
 ; CHECK-NEXT:    store target("aarch64.svcount") [[VAL1:%.*]], ptr [[PPTR]], align 2
 ; CHECK-NEXT:    [[IND_NEXT]] = add i64 [[IND]], 1
 ; CHECK-NEXT:    [[I1:%.*]] = icmp eq i64 [[IND]], [[N]]

diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index aacaae2753c7591..c4e40d46e3179b0 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -183,12 +183,12 @@ define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addr
 define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
 ; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
 ; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT:    store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 4
+; AKF_HSA-NEXT:    store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
 ; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
-; ATTRIBUTOR_HSA-NEXT:    store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 4
+; ATTRIBUTOR_HSA-NEXT:    store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) %out

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 7eb1cb926c190de..d342c4ffa37b01c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -69,7 +69,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -164,7 +163,6 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -280,7 +278,6 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -398,7 +395,6 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -480,7 +476,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -551,7 +546,6 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -630,7 +624,6 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -715,7 +708,6 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -788,7 +780,6 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -856,7 +847,6 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -934,7 +924,6 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -1020,7 +1009,6 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -1283,7 +1271,6 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -1591,7 +1578,6 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -1983,7 +1969,6 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -2391,7 +2376,6 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -2657,7 +2641,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -2879,7 +2862,6 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -3129,7 +3111,6 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -3411,7 +3392,6 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v4i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -3544,7 +3524,6 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -3618,7 +3597,6 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
@@ -3700,7 +3678,6 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -3788,7 +3765,6 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -3934,7 +3910,6 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -4107,7 +4082,6 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -4302,7 +4276,6 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -4519,7 +4492,6 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX6-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v3i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -4713,7 +4685,6 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -4908,7 +4879,6 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -5123,7 +5093,6 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -5360,7 +5329,6 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v3i15:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -5464,7 +5432,6 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -5501,7 +5468,6 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -5537,7 +5503,6 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -5579,7 +5544,6 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -5625,7 +5589,6 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -5771,7 +5734,6 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -5854,7 +5816,6 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -5893,7 +5854,6 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -5930,7 +5890,6 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -5973,7 +5932,6 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -6105,7 +6063,6 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -6181,7 +6138,6 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -6221,7 +6177,6 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -6289,7 +6244,6 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -6365,7 +6319,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -6420,7 +6373,6 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -6605,7 +6557,6 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -6707,7 +6658,6 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -6750,7 +6700,6 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
@@ -6813,7 +6762,6 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -6888,7 +6836,6 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i32_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -7060,7 +7007,6 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
@@ -7134,7 +7080,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @udiv_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i64_oddk_denom:
@@ -7233,7 +7179,6 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -7344,7 +7289,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @udiv_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i64_pow2k_denom:
@@ -7360,7 +7305,6 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -7380,7 +7324,7 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: udiv_i64_pow2_shl_denom:
@@ -7398,7 +7342,6 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -7443,7 +7386,6 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -7560,7 +7502,6 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -7692,7 +7633,6 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -7718,7 +7658,7 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @urem_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i64_oddk_denom:
@@ -7816,7 +7756,6 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
@@ -7925,7 +7864,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @urem_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i64_pow2k_denom:
@@ -7941,7 +7880,6 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -7960,7 +7898,7 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; CHECK-LABEL: @urem_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: urem_i64_pow2_shl_denom:
@@ -7981,7 +7919,6 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
@@ -8029,7 +7966,6 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -8084,7 +8020,6 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -8115,7 +8050,7 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @sdiv_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i64_oddk_denom:
@@ -8212,7 +8147,6 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s4, 0x33fe64
@@ -8315,7 +8249,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @sdiv_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i64_pow2k_denom:
@@ -8335,7 +8269,6 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -8359,7 +8292,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
@@ -8498,7 +8431,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
@@ -8690,7 +8622,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -8829,7 +8760,6 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -9208,7 +9138,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
@@ -9510,7 +9439,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @srem_i64_oddk_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i64_oddk_denom:
@@ -9605,7 +9534,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_mov_b32 s4, 0x33fe64
@@ -9711,7 +9639,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; CHECK-LABEL: @srem_i64_pow2k_denom(
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i64_pow2k_denom:
@@ -9733,7 +9661,6 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
@@ -9759,7 +9686,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; CHECK-LABEL: @srem_i64_pow2_shl_denom(
 ; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
 ; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
-; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ; GFX6-LABEL: srem_i64_pow2_shl_denom:
@@ -9896,7 +9823,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
@@ -10089,7 +10015,6 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i64_pow2k_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
@@ -10388,7 +10313,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; GFX6-NEXT:    s_endpgm
-;
 ; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
index b94304ec2b2ac6c..d85f4b8bb6ceeae 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
@@ -56,12 +56,13 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
 ; CHECK-LABEL: define void @sincos_f32
 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[__SINCOS_]])
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__SINCOS_]], align 4
-; CHECK-NEXT:    store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index c8ac5bcf8d22242..a516d9ecdaf0e4c 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -166,13 +166,13 @@ define void @use_dispatch_id() #1 {
 ; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id
 ; AKF_HSA-SAME: () #[[ATTR1]] {
 ; AKF_HSA-NEXT:    [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
-; AKF_HSA-NEXT:    store volatile i64 [[VAL]], ptr addrspace(1) undef, align 4
+; AKF_HSA-NEXT:    store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id
 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
-; ATTRIBUTOR_HSA-NEXT:    store volatile i64 [[VAL]], ptr addrspace(1) undef, align 4
+; ATTRIBUTOR_HSA-NEXT:    store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   %val = call i64 @llvm.amdgcn.dispatch.id()

diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index b91dcd0bb9a01c6..c355023270c19e4 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -42,7 +42,7 @@ define void @use_everything_else() {
 ; CHECK-NEXT:    [[VAL8:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    store volatile i32 [[VAL8]], ptr addrspace(1) null, align 4
 ; CHECK-NEXT:    [[VAL9:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
-; CHECK-NEXT:    store volatile i64 [[VAL9]], ptr addrspace(1) null, align 4
+; CHECK-NEXT:    store volatile i64 [[VAL9]], ptr addrspace(1) null, align 8
 ; CHECK-NEXT:    ret void
 ;
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()

diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll
index 753f4a8a06fe9fb..659cdb55ded2fe9 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-reject-absolute-addresses.ll
@@ -11,5 +11,5 @@ define amdgpu_kernel void @kern() {
   ret void
 }
 
-!0 = !{i64 0, i64 1}
+!0 = !{i32 0, i32 1}
 

diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
index b97acaa8d7b35b5..a1929a2e8931c11 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
@@ -34,7 +34,7 @@ define void @bar() addrspace(1) {
 ; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
 ; CHECK: @[[FOO_ALIAS:[a-zA-Z0-9_$"\\.-]+]] = hidden alias void (), ptr @foo
 ;.
 ; CHECK-LABEL: define void @foo(

diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
index aca5886bce5f781..968442182229723 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll
@@ -51,7 +51,7 @@ define internal void @bar() {
 ; CHECK: @[[__INIT_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @[[__FINI_ARRAY_START:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @[[__FINI_ARRAY_END:[a-zA-Z0-9_$"\\.-]+]] = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini], section "llvm.metadata"
 ;.
 ; CHECK-LABEL: define internal void @foo() {
 ; CHECK-NEXT:    ret void

diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
index 8c3b1bc3e6eaddc..eefa0b23d0c08ac 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -15,7 +15,7 @@
 
 ;.
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 8, !absolute_symbol !0
-; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
 ; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t poison, align 16, !absolute_symbol !0
 ; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t poison, align 16, !absolute_symbol !0
 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t poison, align 2, !absolute_symbol !0
@@ -99,4 +99,4 @@ define void @f0() {
 ; CHECK: attributes #3 = { "amdgpu-lds-size"="4" }
 ; CHECK: attributes #4 = { "amdgpu-lds-size"="9" }
 
-; CHECK: !0 = !{i64 0, i64 1}
+; CHECK: !0 = !{i32 0, i32 1}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
index 95bf1f931c405d7..61c62c4a00f1069 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -24,7 +24,7 @@
 ; SUPER-ALIGN_ON:  @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 16
 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t poison, align 8
 
-; SUPER-ALIGN_ON:  @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 16
+; SUPER-ALIGN_ON:  @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 8
 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t poison, align 4
 
 ; CHECK-LABEL: @k1
@@ -133,11 +133,9 @@ define amdgpu_kernel void @k3(i64 %x) {
 ; Check that alignment is not propagated if use is not a pointer operand.
 
 ; CHECK-LABEL: @k4
-; SUPER-ALIGN_ON:  store i32 poison, ptr addrspace(3) %gep, align 8
-; SUPER-ALIGN_OFF: store i32 poison, ptr addrspace(3) %gep, align 4
+; CHECK:           store i32 poison, ptr addrspace(3) %gep, align 4
 ; CHECK:           store ptr addrspace(3) %gep, ptr poison, align 4
-; SUPER-ALIGN_ON:  %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 8
-; SUPER-ALIGN_OFF: %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 4
+; CHECK:           %val1 = cmpxchg volatile ptr addrspace(3) %gep, i32 1, i32 2 monotonic monotonic, align 4
 ; CHECK:           %val2 = cmpxchg volatile ptr poison, ptr addrspace(3) %gep, ptr addrspace(3) poison monotonic monotonic, align 4
 define amdgpu_kernel void @k4() {
   %gep = getelementptr inbounds ptr addrspace(3), ptr addrspace(3) @lds.6, i64 1

diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
index 9013fdf64792154..70142fa4b5b2955 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds.ll
@@ -62,4 +62,4 @@ define amdgpu_ps void @k2() {
   ret void
 }
 
-; CHECK: !0 = !{i64 0, i64 1}
+; CHECK: !0 = !{i32 0, i32 1}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index 48f533093858129..2547aacddb18e72 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -69,7 +69,7 @@ declare void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) noalias nocapture writeonly
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="7" }
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i64 0, i64 1}
+; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1}
 ; CHECK: [[META1:![0-9]+]] = !{!2}
 ; CHECK: [[META2:![0-9]+]] = distinct !{!2, !3}
 ; CHECK: [[META3:![0-9]+]] = distinct !{!3}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
index 6e13cfb00a4b35b..d2d15f5ca457726 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -42,7 +42,7 @@ bb:
 !8 = !{!"omnipotent char", !9, i64 0}
 !9 = !{!"Simple C++ TBAA"}
 
-; CHECK:!0 = !{i64 0, i64 1}
+; CHECK:!0 = !{i32 0, i32 1}
 ; CHECK:!1 = !{!2, !3, i64 0}
 ; CHECK:!2 = !{!"no_clobber_ds_load_stores_x2_preexisting_aa", !3, i64 0}
 ; CHECK:!3 = !{!"int", !4, i64 0}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
index fe88b7770e09c73..868ac42dbe8fccb 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -110,7 +110,7 @@ bb:
   ret void
 }
 
-; CHECK: !0 = !{i64 0, i64 1}
+; CHECK: !0 = !{i32 0, i32 1}
 ; CHECK: !1 = !{!2}
 ; CHECK: !2 = distinct !{!2, !3}
 ; CHECK: !3 = distinct !{!3}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
index c999332cb654237..efb4fc0fc2678f9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
@@ -16,7 +16,7 @@
 ; CHECK: @dynamic_kernel_only = external addrspace(3) global [0 x double]
 ; CHECK: @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol !0
-; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
 
 ; Alignment of these must be the maximum of the alignment of the reachable symbols
 ; CHECK: @llvm.amdgcn.expect_align1.dynlds = external addrspace(3) global [0 x i8], align 1, !absolute_symbol !0
@@ -103,7 +103,7 @@ define void @use_shared8() #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4
 ; CHECK-NEXT:    [[DYNAMIC_SHARED81:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) [[DYNAMIC_SHARED81]], i32 0, i32 7
-; CHECK-NEXT:    store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 7
@@ -149,7 +149,7 @@ define amdgpu_kernel void @expect_align8() {
 ; CHECK-LABEL: define amdgpu_kernel void @expect_align8() !llvm.amdgcn.lds.kernel.id !5 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9
-; CHECK-NEXT:    store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    call void @use_shared8()
 ; CHECK-NEXT:    ret void
 ;
@@ -188,8 +188,8 @@ attributes #0 = { noinline }
 ; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 
-; CHECK: !0 = !{i64 0, i64 1}
-; CHECK: !1 = !{i64 4, i64 5}
+; CHECK: !0 = !{i32 0, i32 1}
+; CHECK: !1 = !{i32 4, i32 5}
 ; CHECK: !2 = !{i32 0}
 ; CHECK: !3 = !{i32 1}
 ; CHECK: !4 = !{i32 2}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
index cb34d48875d1031..6c504315e8d7aed 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -26,7 +26,7 @@
 @llvm.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata"
 
 ; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended
-; CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr), ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending addrspace(1) global [2 x ptr] [ptr addrspacecast (ptr addrspace(1) @ignored to ptr), ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
 
 @llvm.compiler.used = appending global [2 x ptr] [ptr addrspacecast (ptr addrspace(3) @tolower to ptr), ptr addrspacecast (ptr addrspace(1) @ignored to ptr)], section "llvm.metadata"
 

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 00a099e9b9c161b..66f65a77dfeb3ca 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s
 
@@ -12,7 +12,7 @@
 @unused = addrspace(3) global i16 poison
 
 ; OPT: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 16, !absolute_symbol !0
-; OPT: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+; OPT: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
 ; OPT: @llvm.amdgcn.kernel.kernel_no_table.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kernel_no_table.lds.t poison, align 8, !absolute_symbol !0
 ; OPT: @llvm.amdgcn.kernel.k01.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k01.lds.t poison, align 4, !absolute_symbol !1
 ; OPT: @llvm.amdgcn.kernel.k23.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k23.lds.t poison, align 8, !absolute_symbol !0
@@ -73,12 +73,12 @@ define void @f2() {
 ; OPT-NEXT:    [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
 ; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4
 ; OPT-NEXT:    [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
-; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4
+; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8
 ; OPT-NEXT:    [[MUL:%.*]] = mul i64 [[LD]], 4
 ; OPT-NEXT:    [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
 ; OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4
 ; OPT-NEXT:    [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
-; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4
+; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: f2:
@@ -193,7 +193,7 @@ define amdgpu_kernel void @k01() {
 
 define amdgpu_kernel void @k23() {
 ; OPT-LABEL: @k23(
-; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope !4, !noalias !7
 ; OPT-NEXT:    call void @f2()
 ; OPT-NEXT:    call void @f3()
 ; OPT-NEXT:    ret void
@@ -231,12 +231,12 @@ define amdgpu_kernel void @k23() {
 ; Access and allocate three variables
 define amdgpu_kernel void @k123() {
 ; OPT-LABEL: @k123(
-; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]]
+; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !10, !noalias !13
 ; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
 ; OPT-NEXT:    call void @f1()
-; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]]
+; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10
 ; OPT-NEXT:    [[MUL:%.*]] = mul i8 [[LD]], 8
-; OPT-NEXT:    store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META13]], !noalias [[META10]]
+; OPT-NEXT:    store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope !13, !noalias !10
 ; OPT-NEXT:    call void @f2()
 ; OPT-NEXT:    ret void
 ;
@@ -285,14 +285,14 @@ define amdgpu_kernel void @k123() {
 
 
 ; OPT: attributes #0 = { "amdgpu-lds-size"="8" }
-; OPT: attributes #1 = { "amdgpu-lds-size"="12" }
-; OPT: attributes #2 = { "amdgpu-lds-size"="20" }
+; OPT: attributes #1 = { "amdgpu-lds-size"="16" }
+; OPT: attributes #2 = { "amdgpu-lds-size"="24" }
 ; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 
-; OPT: !0 = !{i64 0, i64 1}
-; OPT: !1 = !{i64 4, i64 5}
-; OPT: !2 = !{i64 8, i64 9}
+; OPT: !0 = !{i32 0, i32 1}
+; OPT: !1 = !{i32 4, i32 5}
+; OPT: !2 = !{i32 8, i32 9}
 ; OPT: !3 = !{i32 1}
 ; OPT: !4 = !{!5}
 ; OPT: !5 = distinct !{!5, !6}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 82004b84275e6f1..f1545ed267be651 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -118,12 +118,12 @@ define void @f2() {
 ; OPT-NEXT:    [[V22:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
 ; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4
 ; OPT-NEXT:    [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
-; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 4
+; OPT-NEXT:    [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8
 ; OPT-NEXT:    [[MUL:%.*]] = mul i64 [[LD]], 4
 ; OPT-NEXT:    [[V2:%.*]] = getelementptr inbounds [3 x [4 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 2
 ; OPT-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4
 ; OPT-NEXT:    [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
-; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) [[V21]], align 4
+; OPT-NEXT:    store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8
 ; OPT-NEXT:    ret void
 ;
 ; GCN-LABEL: f2:
@@ -300,7 +300,7 @@ define amdgpu_kernel void @k23() {
 ; Access and allocate three variables
 define amdgpu_kernel void @k123() {
 ; OPT-LABEL: define amdgpu_kernel void @k123(
-; OPT-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !13 {
+; OPT-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id !13 {
 ; OPT-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope !14, !noalias !17
 ; OPT-NEXT:    call void @f1()
 ; OPT-NEXT:    [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 2, !alias.scope !20, !noalias !21
@@ -352,8 +352,7 @@ define amdgpu_kernel void @k123() {
 ; OPT: declare i32 @llvm.amdgcn.lds.kernel.id()
 
 ; OPT: attributes #0 = { "amdgpu-lds-size"="8" }
-; OPT: attributes #1 = { "amdgpu-lds-size"="12" }
-; OPT: attributes #2 = { "amdgpu-lds-size"="16" }
+; OPT: attributes #1 = { "amdgpu-lds-size"="16" }
 
 !0 = !{i64 0, i64 1}
 !1 = !{i32 0}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
index 7aa1d212ed417ab..83bb61d1a632351 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll
@@ -11,7 +11,7 @@
 ; CHECK: @__init_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @__fini_array_start = external addrspace(1) constant [0 x ptr addrspace(1)]
 ; CHECK: @__fini_array_end = external addrspace(1) constant [0 x ptr addrspace(1)]
-; CHECK: @llvm.used = appending global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini]
+; CHECK: @llvm.used = appending addrspace(1) global [2 x ptr] [ptr @amdgcn.device.init, ptr @amdgcn.device.fini]
 ; UTC_ARGS: --enable
 
 

diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 17bbcde2cc62afa..637805122b1296b 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -50,7 +50,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap
 ; IR-NEXT:  bb:
 ; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]]
 ; IR-NEXT:    [[MY_TMP1:%.*]] = getelementptr inbounds i64, ptr addrspace(3) [[ARG:%.*]], i32 [[MY_TMP]]
-; IR-NEXT:    [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 4
+; IR-NEXT:    [[MY_TMP2:%.*]] = load volatile i64, ptr addrspace(3) [[MY_TMP1]], align 8
 ; IR-NEXT:    br label [[BB5:%.*]]
 ; IR:       bb3:
 ; IR-NEXT:    br i1 true, label [[BB4:%.*]], label [[BB13:%.*]]
@@ -93,6 +93,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap
 ; IR:       bb23:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
 ; IR-NEXT:    ret void
+;
 bb:
   %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %my.tmp1 = getelementptr inbounds i64, ptr addrspace(3) %arg, i32 %my.tmp
@@ -276,6 +277,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
 ; IR-NEXT:    store volatile i32 0, ptr addrspace(1) undef, align 4
 ; IR-NEXT:    ret void
+;
 bb:
   %my.tmp1134 = load volatile i32, ptr addrspace(1) undef
   %my.tmp1235 = icmp slt i32 %my.tmp1134, 9

diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll
index caf260a083ccbae..a1f0a5da125df37 100644
--- a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll
+++ b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll
@@ -86,7 +86,7 @@ define amdgpu_kernel void @format_str_f(float %f32.0, double %f64, float %f32.1,
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4
 ; GCN-NEXT:    store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 4
-; GCN-NEXT:    store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4
+; GCN-NEXT:    store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 8
 ; GCN-NEXT:    store <2 x float> <float 1.000000e+00, float 2.000000e+00>, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 8
@@ -108,7 +108,7 @@ define void @format_str_ptr(ptr %ptr.flat, ptr addrspace(3) %ptr.lds, ptr addrsp
 ; R600-NEXT:    ret void
 ;
 ; GCN-LABEL: @format_str_ptr(
-; GCN-NEXT:    [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 44)
+; GCN-NEXT:    [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 36)
 ; GCN-NEXT:    br label [[DOTSPLIT:%.*]]
 ; GCN:       .split:
 ; GCN-NEXT:    [[TMP1:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
@@ -120,12 +120,12 @@ define void @format_str_ptr(ptr %ptr.flat, ptr addrspace(3) %ptr.lds, ptr addrsp
 ; GCN-NEXT:    [[PRINTBUFFGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTF_ALLOC_FN]], i32 4
 ; GCN-NEXT:    store ptr [[PTR_FLAT:%.*]], ptr addrspace(1) [[PRINTBUFFGEP]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFGEP]], i32 8
-; GCN-NEXT:    store ptr addrspace(3) [[PTR_LDS:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 8
-; GCN-NEXT:    [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 8
+; GCN-NEXT:    store ptr addrspace(3) [[PTR_LDS:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR]], align 4
+; GCN-NEXT:    [[PRINTBUFFNEXTPTR1:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR]], i32 4
 ; GCN-NEXT:    store ptr addrspace(1) [[PTR_GLOBAL:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR2:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR1]], i32 8
-; GCN-NEXT:    store ptr addrspace(5) [[PTR_STACK:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 8
-; GCN-NEXT:    [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 8
+; GCN-NEXT:    store ptr addrspace(5) [[PTR_STACK:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], align 4
+; GCN-NEXT:    [[PRINTBUFFNEXTPTR3:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR2]], i32 4
 ; GCN-NEXT:    store ptr addrspace(4) [[PTR_CONST:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], align 8
 ; GCN-NEXT:    br label [[TMP3]]
 ; GCN:       3:
@@ -145,7 +145,7 @@ define amdgpu_kernel void @format_str_d(i1 %i1, i4 %i4, i8 %i8, i24 %i24, i16 %i
 ; GCN-NEXT:    [[TMP2:%.*]] = sext i4 [[I4:%.*]] to i32
 ; GCN-NEXT:    [[TMP3:%.*]] = sext i8 [[I8:%.*]] to i32
 ; GCN-NEXT:    [[TMP4:%.*]] = sext i16 [[I16:%.*]] to i32
-; GCN-NEXT:    [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 68)
+; GCN-NEXT:    [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72)
 ; GCN-NEXT:    br label [[DOTSPLIT:%.*]]
 ; GCN:       .split:
 ; GCN-NEXT:    [[TMP5:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
@@ -167,11 +167,11 @@ define amdgpu_kernel void @format_str_d(i1 %i1, i4 %i4, i8 %i8, i24 %i24, i16 %i
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 4
 ; GCN-NEXT:    store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4
-; GCN-NEXT:    store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4
+; GCN-NEXT:    store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8
-; GCN-NEXT:    store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4
-; GCN-NEXT:    [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 12
-; GCN-NEXT:    store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 4
+; GCN-NEXT:    store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8
+; GCN-NEXT:    [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 16
+; GCN-NEXT:    store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 16
 ; GCN-NEXT:    store i32 1234, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 4
 ; GCN-NEXT:    br label [[TMP7]]
@@ -192,7 +192,7 @@ define amdgpu_kernel void @format_str_u(i1 %i1, i4 %i4, i8 %i8, i24 %i24, i16 %i
 ; GCN-NEXT:    [[TMP2:%.*]] = zext i4 [[I4:%.*]] to i32
 ; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[I8:%.*]] to i32
 ; GCN-NEXT:    [[TMP4:%.*]] = zext i16 [[I16:%.*]] to i32
-; GCN-NEXT:    [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 68)
+; GCN-NEXT:    [[PRINTF_ALLOC_FN:%.*]] = call ptr addrspace(1) @__printf_alloc(i32 72)
 ; GCN-NEXT:    br label [[DOTSPLIT:%.*]]
 ; GCN:       .split:
 ; GCN-NEXT:    [[TMP5:%.*]] = icmp ne ptr addrspace(1) [[PRINTF_ALLOC_FN]], null
@@ -214,11 +214,11 @@ define amdgpu_kernel void @format_str_u(i1 %i1, i4 %i4, i8 %i8, i24 %i24, i16 %i
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR4:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR3]], i32 4
 ; GCN-NEXT:    store i32 [[I32:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], align 4
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR5:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR4]], i32 4
-; GCN-NEXT:    store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 4
+; GCN-NEXT:    store i64 [[I64:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR6:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR5]], i32 8
-; GCN-NEXT:    store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 4
-; GCN-NEXT:    [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 12
-; GCN-NEXT:    store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 4
+; GCN-NEXT:    store i96 [[I96:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], align 8
+; GCN-NEXT:    [[PRINTBUFFNEXTPTR7:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR6]], i32 16
+; GCN-NEXT:    store i128 [[I128:%.*]], ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], align 8
 ; GCN-NEXT:    [[PRINTBUFFNEXTPTR8:%.*]] = getelementptr i8, ptr addrspace(1) [[PRINTBUFFNEXTPTR7]], i32 16
 ; GCN-NEXT:    store i32 1234, ptr addrspace(1) [[PRINTBUFFNEXTPTR8]], align 4
 ; GCN-NEXT:    br label [[TMP7]]

diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 708742bdec83d1c..73457654307019e 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -134,10 +134,10 @@ define i32 @cmpxchg_private_i32(ptr addrspace(5) %ptr) {
 
 define i64 @cmpxchg_private_i64(ptr addrspace(5) %ptr) {
 ; IR-LABEL: @cmpxchg_private_i64(
-; IR-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(5) [[PTR:%.*]], align 8
 ; IR-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
 ; IR-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 1, i64 [[TMP1]]
-; IR-NEXT:    store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT:    store i64 [[TMP3]], ptr addrspace(5) [[PTR]], align 8
 ; IR-NEXT:    [[TMP4:%.*]] = insertvalue { i64, i1 } poison, i64 [[TMP1]], 0
 ; IR-NEXT:    [[TMP5:%.*]] = insertvalue { i64, i1 } [[TMP4]], i1 [[TMP2]], 1
 ; IR-NEXT:    [[RESULT_0:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
index 68737cb227a00e1..05ee10598b21a6a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -93,7 +93,7 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-NEXT:    [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0
 ; CHECK-NEXT:    [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
 ; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
 ; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index a99c01edcc12d33..15af1f17e230ecf 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -7,7 +7,7 @@ define amdgpu_kernel void @memset_all_zero(i64 %val) {
 ; CHECK-LABEL: @memset_all_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i32 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -24,7 +24,7 @@ define amdgpu_kernel void @memset_all_5(i64 %val) {
 ; CHECK-LABEL: @memset_all_5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i32 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -42,7 +42,7 @@ define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -57,7 +57,7 @@ define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -73,7 +73,7 @@ define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
 ; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
index 0d35ba8e1161e9a..d946d594fde2f00 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
@@ -1,8 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-rewrite-out-arguments < %s | FileCheck %s
-; Temporarily add an explicit datalayout until https://reviews.llvm.org/D141060 lands
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
-target triple = "amdgcn-amd-amdhsa"
 
 define void @no_ret_blocks() #0 {
   unreachable

diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
index ecbfec96a19b039..63a0945edf152a3 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1-bpfeb.ll
@@ -36,11 +36,11 @@ define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
 ; CHECK-ALU64-NEXT:  # %bb.0: # %entry
 ; CHECK-ALU64-NEXT:    #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU64-NEXT:  .Ltmp0:
-; CHECK-ALU64-NEXT:    r1 = 20
+; CHECK-ALU64-NEXT:    r1 = 16
 ; CHECK-ALU64-NEXT:  .Ltmp1:
 ; CHECK-ALU64-NEXT:  .Ltmp2:
 ; CHECK-ALU64-NEXT:  .Ltmp3:
-; CHECK-ALU64-NEXT:    r0 = 4
+; CHECK-ALU64-NEXT:    r0 = 8
 ; CHECK-ALU64-NEXT:  .Ltmp4:
 ; CHECK-ALU64-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU64-NEXT:  .Ltmp5:
@@ -67,11 +67,11 @@ define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
 ; CHECK-ALU32-NEXT:  # %bb.0: # %entry
 ; CHECK-ALU32-NEXT:    #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU32-NEXT:  .Ltmp0:
-; CHECK-ALU32-NEXT:    r1 = 20
+; CHECK-ALU32-NEXT:    r1 = 16
 ; CHECK-ALU32-NEXT:  .Ltmp1:
 ; CHECK-ALU32-NEXT:  .Ltmp2:
 ; CHECK-ALU32-NEXT:  .Ltmp3:
-; CHECK-ALU32-NEXT:    r0 = 4
+; CHECK-ALU32-NEXT:    r0 = 8
 ; CHECK-ALU32-NEXT:  .Ltmp4:
 ; CHECK-ALU32-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU32-NEXT:  .Ltmp5:

diff --git a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
index 66a1cf291cd4c0a..33462198eab15ac 100644
--- a/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/field-reloc-bitfield-1.ll
@@ -36,18 +36,18 @@ define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
 ; CHECK-ALU64-NEXT:  # %bb.0: # %entry
 ; CHECK-ALU64-NEXT:    #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU64-NEXT:  .Ltmp0:
-; CHECK-ALU64-NEXT:    r1 = 20
+; CHECK-ALU64-NEXT:    r1 = 16
 ; CHECK-ALU64-NEXT:  .Ltmp1:
 ; CHECK-ALU64-NEXT:  .Ltmp2:
 ; CHECK-ALU64-NEXT:  .Ltmp3:
-; CHECK-ALU64-NEXT:    r0 = 4
+; CHECK-ALU64-NEXT:    r0 = 8
 ; CHECK-ALU64-NEXT:  .Ltmp4:
 ; CHECK-ALU64-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU64-NEXT:  .Ltmp5:
 ; CHECK-ALU64-NEXT:  .Ltmp6:
 ; CHECK-ALU64-NEXT:    r0 += r1
 ; CHECK-ALU64-NEXT:  .Ltmp7:
-; CHECK-ALU64-NEXT:    r1 = 50
+; CHECK-ALU64-NEXT:    r1 = 18
 ; CHECK-ALU64-NEXT:    .loc 1 13 67 # test.c:13:67
 ; CHECK-ALU64-NEXT:  .Ltmp8:
 ; CHECK-ALU64-NEXT:    r0 += r1
@@ -67,18 +67,18 @@ define dso_local i32 @test(ptr %arg) local_unnamed_addr #0 !dbg !13 {
 ; CHECK-ALU32-NEXT:  # %bb.0: # %entry
 ; CHECK-ALU32-NEXT:    #DEBUG_VALUE: test:arg <- $r1
 ; CHECK-ALU32-NEXT:  .Ltmp0:
-; CHECK-ALU32-NEXT:    r1 = 20
+; CHECK-ALU32-NEXT:    r1 = 16
 ; CHECK-ALU32-NEXT:  .Ltmp1:
 ; CHECK-ALU32-NEXT:  .Ltmp2:
 ; CHECK-ALU32-NEXT:  .Ltmp3:
-; CHECK-ALU32-NEXT:    r0 = 4
+; CHECK-ALU32-NEXT:    r0 = 8
 ; CHECK-ALU32-NEXT:  .Ltmp4:
 ; CHECK-ALU32-NEXT:    .loc 1 12 69 prologue_end # test.c:12:69
 ; CHECK-ALU32-NEXT:  .Ltmp5:
 ; CHECK-ALU32-NEXT:  .Ltmp6:
 ; CHECK-ALU32-NEXT:    w0 += w1
 ; CHECK-ALU32-NEXT:  .Ltmp7:
-; CHECK-ALU32-NEXT:    r1 = 50
+; CHECK-ALU32-NEXT:    r1 = 18
 ; CHECK-ALU32-NEXT:    .loc 1 13 67 # test.c:13:67
 ; CHECK-ALU32-NEXT:  .Ltmp8:
 ; CHECK-ALU32-NEXT:    w0 += w1

diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll
index 2b592fdbf0d299c..61c8f8863489812 100644
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-sdiv129.ll
@@ -4,7 +4,7 @@
 define void @sdiv129(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @sdiv129(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -66,7 +66,7 @@ define void @sdiv129(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP48:%.*]] = phi i129 [ [[TMP25]], [[UDIV_LOOP_EXIT]] ], [ [[TMP20]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP49:%.*]] = xor i129 [[TMP48]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = sub i129 [[TMP49]], [[TMP8]]
-; CHECK-NEXT:    store i129 [[TMP50]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i129 [[TMP50]], ptr [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr

diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll
index f0f410def9641ae..eddb2095c40f582 100644
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-srem129.ll
@@ -4,7 +4,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr i129 [[TMP0]], 128
@@ -69,7 +69,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP51:%.*]] = sub i129 [[TMP8]], [[TMP50]]
 ; CHECK-NEXT:    [[TMP52:%.*]] = xor i129 [[TMP51]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP53:%.*]] = sub i129 [[TMP52]], [[TMP2]]
-; CHECK-NEXT:    store i129 [[TMP53]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i129 [[TMP53]], ptr [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr

diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll
index 79b527ec4f98968..78bce5c398cadfc 100644
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-udiv129.ll
@@ -4,7 +4,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i129 [[TMP0]], 0
@@ -55,7 +55,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    br i1 [[TMP38]], label [[UDIV_LOOP_EXIT]], label [[UDIV_PREHEADER]]
 ; CHECK:       udiv-end:
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi i129 [ [[TMP16]], [[UDIV_LOOP_EXIT]] ], [ [[TMP11]], [[_UDIV_SPECIAL_CASES:%.*]] ]
-; CHECK-NEXT:    store i129 [[TMP39]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i129 [[TMP39]], ptr [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr

diff --git a/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll b/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll
index 134875b00d7176a..ebe75fcf23c4bcf 100644
--- a/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll
+++ b/llvm/test/CodeGen/X86/expand-large-div-rem-urem129.ll
@@ -4,7 +4,7 @@
 define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  _udiv-special-cases:
-; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[A:%.*]] = load i129, ptr [[PTR:%.*]], align 16
 ; CHECK-NEXT:    [[TMP0:%.*]] = freeze i129 [[A]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i129 3
 ; CHECK-NEXT:    [[TMP2:%.*]] = freeze i129 [[TMP1]]
@@ -59,7 +59,7 @@ define void @test(ptr %ptr, ptr %out) nounwind {
 ; CHECK-NEXT:    [[TMP41:%.*]] = phi i129 [ [[TMP18]], [[UDIV_LOOP_EXIT]] ], [ [[TMP13]], [[_UDIV_SPECIAL_CASES:%.*]] ]
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i129 [[TMP1]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = sub i129 [[TMP0]], [[TMP42]]
-; CHECK-NEXT:    store i129 [[TMP43]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i129 [[TMP43]], ptr [[OUT:%.*]], align 16
 ; CHECK-NEXT:    ret void
 ;
   %a = load i129, ptr %ptr

diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll
index 105ce5197fa6e9e..c678164f1f36b4f 100644
--- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll
@@ -80,7 +80,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SCOPE-LABEL: @standard_lifetime(
 ; AARCH64-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -88,13 +88,13 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -132,7 +132,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-NOSCOPE-LABEL: @standard_lifetime(
 ; AARCH64-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -140,13 +140,13 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -182,7 +182,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime(
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -190,13 +190,13 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -237,7 +237,7 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime(
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1:![0-9]+]])
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -245,13 +245,13 @@ define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -361,7 +361,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-SCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -369,13 +369,13 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -413,7 +413,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-NOSCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -421,13 +421,13 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -463,7 +463,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-SHORT-SCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -471,13 +471,13 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -518,7 +518,7 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-SHORT-NOSCOPE-LABEL: @standard_lifetime_optnone(
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -526,13 +526,13 @@ define dso_local i32 @standard_lifetime_optnone() local_unnamed_addr optnone noi
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -632,7 +632,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -640,13 +640,13 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -678,7 +678,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-NOSCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -686,13 +686,13 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -724,7 +724,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-SCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -732,13 +732,13 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -773,7 +773,7 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-NOSCOPE-LABEL: @multiple_lifetimes(
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -781,13 +781,13 @@ define dso_local i32 @multiple_lifetimes() local_unnamed_addr sanitize_hwaddress
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -892,7 +892,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SCOPE-LABEL: @unreachable_exit(
 ; AARCH64-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -900,13 +900,13 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -949,7 +949,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-NOSCOPE-LABEL: @unreachable_exit(
 ; AARCH64-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -957,13 +957,13 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1005,7 +1005,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-SCOPE-LABEL: @unreachable_exit(
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1013,13 +1013,13 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1065,7 +1065,7 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-NOSCOPE-LABEL: @unreachable_exit(
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1073,13 +1073,13 @@ define dso_local i32 @unreachable_exit() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1200,7 +1200,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1208,13 +1208,13 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1261,7 +1261,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-NOSCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1269,13 +1269,13 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1313,7 +1313,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-SCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1321,13 +1321,13 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-SCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-SCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-SCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr
@@ -1377,7 +1377,7 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-NOSCOPE-LABEL: @diamond_lifetime(
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP1:%.*]] = call ptr @llvm.thread.pointer()
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
-; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP4:%.*]] = ashr i64 [[TMP3]], 3
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.read_register.i64(metadata [[META1]])
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP6:%.*]] = call ptr @llvm.frameaddress.p0(i32 0)
@@ -1385,13 +1385,13 @@ define dso_local i32 @diamond_lifetime() local_unnamed_addr sanitize_hwaddress {
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 44
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP9:%.*]] = or i64 [[TMP5]], [[TMP8]]
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP3]] to ptr
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP11:%.*]] = ashr i64 [[TMP3]], 56
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 12
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP12]], -1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP14:%.*]] = add i64 [[TMP3]], 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], [[TMP13]]
-; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 4
+; AARCH64-SHORT-NOSCOPE-NEXT:    store i64 [[TMP15]], ptr [[TMP2]], align 8
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], 4294967295
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[HWASAN_SHADOW:%.*]] = add i64 [[TMP16]], 1
 ; AARCH64-SHORT-NOSCOPE-NEXT:    [[TMP17:%.*]] = inttoptr i64 [[HWASAN_SHADOW]] to ptr

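The alignment updates in the HWAddressSanitizer checks above follow mechanically from the inferred layout: a default-constructed DataLayout only guarantees 4-byte ABI alignment for i64 (i64:32:64), while the AArch64 layout inferred from the triple specifies i64:64, so loads and stores the pass emits without an explicit alignment are now printed with their natural align 8. A minimal, hypothetical reproduction of that rule (not part of this commit):

; RUN: opt -mtriple=aarch64 -S %s | FileCheck %s
; No "target datalayout" line: opt now infers the AArch64 layout from
; -mtriple, so the parser assigns this i64 load its ABI alignment of 8
; rather than the default layout's 4.
define i64 @read_slot(ptr %p) {
; CHECK: load i64, ptr %p, align 8
  %v = load i64, ptr %p
  ret i64 %v
}
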
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
index 1fcb6047797e38d..2ba72d37b70f058 100644
--- a/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
+++ b/llvm/test/Instrumentation/SanitizerCoverage/cmp-tracing-api-x86_32.ll
@@ -6,14 +6,14 @@ target triple = "i386-unknown-linux-gnu"
 define i32 @foo() #0 {
 ; CHECK-LABEL: define i32 @foo() comdat {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @__sanitizer_cov_trace_pc_guard(ptr @__sancov_gen_) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    call void @__sanitizer_cov_trace_pc_guard(ptr inttoptr (i32 ptrtoint (ptr @__sancov_gen_ to i32) to ptr)) #[[ATTR1:[0-9]+]]
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
   ret i32 0
 }
 
-; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_indir(i64)
+; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_indir(i32)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp1(i8 zeroext, i8 zeroext)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp2(i16 zeroext, i16 zeroext)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_cmp4(i32 zeroext, i32 zeroext)
@@ -24,7 +24,7 @@ entry:
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_const_cmp8(i64, i64)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_div4(i32 zeroext)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_div8(i64)
-; CHECK-DAG: declare void @__sanitizer_cov_trace_gep(i64)
+; CHECK-DAG: declare void @__sanitizer_cov_trace_gep(i32)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_switch(i64, ptr)
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_pc()
 ; CHECK-DAG: declare void @__sanitizer_cov_trace_pc_guard(ptr)

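The SanitizerCoverage update above is the pointer-width side of the same change: the pass declares its pointer-sized callbacks with the target's intptr type and computes the guard argument through a ptrtoint/inttoptr round-trip in that type, so the 32-bit layout inferred from the i386 triple turns the previous i64 signatures into i32 ones. A hypothetical sketch of the rule, not a drop-in test:

; With no explicit datalayout, the i386 triple now yields 32-bit
; pointers, so the IntptrTy-based callback declarations use i32.
target triple = "i386-unknown-linux-gnu"
declare void @__sanitizer_cov_trace_pc_indir(i32)
declare void @__sanitizer_cov_trace_gep(i32)
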
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
index ad57ca640a515de..6a6e416bdbc89de 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -1,96 +1,172 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s
-; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s --check-prefixes=CHECK,GCN
+; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s --check-prefixes=CHECK,R600
 
 define i8 @test_atomicrmw_xchg_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_xchg_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
-; CHECK-NEXT:    [[TMP7:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_xchg_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
+; GCN-NEXT:    [[TMP7:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_xchg_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT:    [[TMP7:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP6]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP7]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw xchg ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_add_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_add_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
-; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_add_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; GCN-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; GCN-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; GCN-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_add_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; R600-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw add ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_add_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_add_i8_global_agent_align2(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
-; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_add_i8_global_agent_align2(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; GCN-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; GCN-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; GCN-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_add_i8_global_agent_align2(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[NEW:%.*]] = add i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; R600-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw add ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
   ret i8 %res
@@ -120,303 +196,546 @@ define i8 @test_atomicrmw_add_i8_global_agent_align4(ptr addrspace(1) %ptr, i8 %
 }
 
 define i8 @test_atomicrmw_sub_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_sub_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
-; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_sub_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; GCN-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; GCN-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; GCN-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_sub_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[NEW:%.*]] = sub i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT:    [[TMP5:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT:    [[TMP6:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], [[TMP5]]
+; R600-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP7]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw sub ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_and_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_and_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]]
-; CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_and_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]]
+; GCN-NEXT:    [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_and_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[ANDOPERAND:%.*]] = or i32 [[INV_MASK]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT:    [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw and ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_nand_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_nand_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
-; CHECK-NEXT:    [[NEW:%.*]] = xor i32 [[TMP5]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = and i32 [[NEW]], [[MASK]]
-; CHECK-NEXT:    [[TMP7:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP8]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP9]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP9]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_nand_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; GCN-NEXT:    [[NEW:%.*]] = xor i32 [[TMP5]], -1
+; GCN-NEXT:    [[TMP6:%.*]] = and i32 [[NEW]], [[MASK]]
+; GCN-NEXT:    [[TMP7:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP6]]
+; GCN-NEXT:    [[TMP9:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP8]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP9]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP9]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_nand_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[TMP5:%.*]] = and i32 [[LOADED]], [[VALOPERAND_SHIFTED]]
+; R600-NEXT:    [[NEW:%.*]] = xor i32 [[TMP5]], -1
+; R600-NEXT:    [[TMP6:%.*]] = and i32 [[NEW]], [[MASK]]
+; R600-NEXT:    [[TMP7:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP6]]
+; R600-NEXT:    [[TMP9:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[TMP8]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP9]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP9]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw nand ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_or_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_or_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_or_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_or_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw or ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_xor_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_xor_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
-; CHECK-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED]]
+; GCN-LABEL: @test_atomicrmw_xor_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; GCN-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED]]
+;
+; R600-LABEL: @test_atomicrmw_xor_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32
+; R600-NEXT:    [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] syncscope("agent") seq_cst, align 4
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED]]
 ;
   %res = atomicrmw xor ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_max_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_max_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_max_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = icmp sgt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_max_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = icmp sgt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw max ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_min_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_min_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp sle i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_min_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = icmp sle i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_min_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = icmp sle i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw min ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_umax_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_umax_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_umax_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_umax_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw umax ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_umin_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_umin_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ule i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_umin_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = icmp ule i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_umin_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = icmp ule i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP4]], i8 [[EXTRACTED]], i8 [[VALUE]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw umin ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_cmpxchg_i8_global_agent(ptr addrspace(1) %out, i8 %in, i8 %old) {
-; CHECK-LABEL: @test_cmpxchg_i8_global_agent(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT:%.*]], i64 4
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[GEP]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]]
-; CHECK-NEXT:    br label [[PARTWORD_CMPXCHG_LOOP:%.*]]
-; CHECK:       partword.cmpxchg.loop:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]]
-; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1
-; CHECK-NEXT:    br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]]
-; CHECK:       partword.cmpxchg.failure:
-; CHECK-NEXT:    [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]]
-; CHECK:       partword.cmpxchg.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1
-; CHECK-NEXT:    [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0
-; CHECK-NEXT:    ret i8 [[EXTRACT]]
+; GCN-LABEL: @test_cmpxchg_i8_global_agent(
+; GCN-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT:%.*]], i64 4
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[GEP]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32
+; GCN-NEXT:    [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32
+; GCN-NEXT:    [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]]
+; GCN-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]]
+; GCN-NEXT:    br label [[PARTWORD_CMPXCHG_LOOP:%.*]]
+; GCN:       partword.cmpxchg.loop:
+; GCN-NEXT:    [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ]
+; GCN-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]]
+; GCN-NEXT:    [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]]
+; GCN-NEXT:    [[TMP12:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0
+; GCN-NEXT:    [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1
+; GCN-NEXT:    br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]]
+; GCN:       partword.cmpxchg.failure:
+; GCN-NEXT:    [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]]
+; GCN-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]]
+; GCN-NEXT:    br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]]
+; GCN:       partword.cmpxchg.end:
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0
+; GCN-NEXT:    [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1
+; GCN-NEXT:    [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0
+; GCN-NEXT:    ret i8 [[EXTRACT]]
+;
+; R600-LABEL: @test_cmpxchg_i8_global_agent(
+; R600-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(1) [[OUT:%.*]], i64 4
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[GEP]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32
+; R600-NEXT:    [[TMP4:%.*]] = shl i32 [[TMP3]], [[TMP2]]
+; R600-NEXT:    [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32
+; R600-NEXT:    [[TMP6:%.*]] = shl i32 [[TMP5]], [[TMP2]]
+; R600-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]]
+; R600-NEXT:    br label [[PARTWORD_CMPXCHG_LOOP:%.*]]
+; R600:       partword.cmpxchg.loop:
+; R600-NEXT:    [[TMP9:%.*]] = phi i32 [ [[TMP8]], [[TMP0:%.*]] ], [ [[TMP15:%.*]], [[PARTWORD_CMPXCHG_FAILURE:%.*]] ]
+; R600-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP4]]
+; R600-NEXT:    [[TMP11:%.*]] = or i32 [[TMP9]], [[TMP6]]
+; R600-NEXT:    [[TMP12:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[TMP11]], i32 [[TMP10]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[TMP13:%.*]] = extractvalue { i32, i1 } [[TMP12]], 0
+; R600-NEXT:    [[TMP14:%.*]] = extractvalue { i32, i1 } [[TMP12]], 1
+; R600-NEXT:    br i1 [[TMP14]], label [[PARTWORD_CMPXCHG_END:%.*]], label [[PARTWORD_CMPXCHG_FAILURE]]
+; R600:       partword.cmpxchg.failure:
+; R600-NEXT:    [[TMP15]] = and i32 [[TMP13]], [[INV_MASK]]
+; R600-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]]
+; R600-NEXT:    br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]]
+; R600:       partword.cmpxchg.end:
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0
+; R600-NEXT:    [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1
+; R600-NEXT:    [[EXTRACT:%.*]] = extractvalue { i8, i1 } [[TMP18]], 0
+; R600-NEXT:    ret i8 [[EXTRACT]]
 ;
   %gep = getelementptr i8, ptr addrspace(1) %out, i64 4
   %res = cmpxchg ptr addrspace(1) %gep, i8 %old, i8 %in syncscope("agent") seq_cst seq_cst
@@ -427,17 +746,16 @@ define i8 @test_cmpxchg_i8_global_agent(ptr addrspace(1) %out, i8 %in, i8 %old)
 define i8 @test_cmpxchg_i8_local_align2(ptr addrspace(3) %out, i8 %in, i8 %old) {
 ; CHECK-LABEL: @test_cmpxchg_i8_local_align2(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT:%.*]], i64 4
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[GEP]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[GEP]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[GEP]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[IN:%.*]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shl i32 [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i8 [[OLD:%.*]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i32 [[TMP5]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shl i32 [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = and i32 [[TMP7]], [[INV_MASK]]
 ; CHECK-NEXT:    br label [[PARTWORD_CMPXCHG_LOOP:%.*]]
@@ -454,7 +772,7 @@ define i8 @test_cmpxchg_i8_local_align2(ptr addrspace(3) %out, i8 %in, i8 %old)
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP9]], [[TMP15]]
 ; CHECK-NEXT:    br i1 [[TMP16]], label [[PARTWORD_CMPXCHG_LOOP]], label [[PARTWORD_CMPXCHG_END]]
 ; CHECK:       partword.cmpxchg.end:
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[TMP13]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertvalue { i8, i1 } poison, i8 [[EXTRACTED]], 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = insertvalue { i8, i1 } [[TMP17]], i1 [[TMP14]], 1
@@ -468,70 +786,128 @@ define i8 @test_cmpxchg_i8_local_align2(ptr addrspace(3) %out, i8 %in, i8 %old)
 }
 
 define i8 @test_atomicrmw_inc_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_inc_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_inc_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_inc_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_inc_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_inc_i8_global_agent_align2(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_inc_i8_global_agent_align2(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_inc_i8_global_agent_align2(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
   ret i8 %res
@@ -564,24 +940,23 @@ define i8 @test_atomicrmw_inc_i8_global_agent_align4(ptr addrspace(1) %ptr, i8 %
 
 define i8 @test_atomicrmw_inc_i8_local(ptr addrspace(3) %ptr, i8 %value) {
 ; CHECK-LABEL: @test_atomicrmw_inc_i8_local(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
 ; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
 ; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -589,7 +964,7 @@ define i8 @test_atomicrmw_inc_i8_local(ptr addrspace(3) %ptr, i8 %value) {
 ; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
 ;
@@ -599,24 +974,23 @@ define i8 @test_atomicrmw_inc_i8_local(ptr addrspace(3) %ptr, i8 %value) {
 
 define i8 @test_atomicrmw_inc_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) {
 ; CHECK-LABEL: @test_atomicrmw_inc_i8_local_align2(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
 ; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
 ; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -624,7 +998,7 @@ define i8 @test_atomicrmw_inc_i8_local_align2(ptr addrspace(3) %ptr, i8 %value)
 ; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
 ;
@@ -658,70 +1032,128 @@ define i8 @test_atomicrmw_inc_i8_local_align4(ptr addrspace(3) %ptr, i8 %value)
 }
 
 define i8 @test_atomicrmw_inc_i8_flat_agent(ptr %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_inc_i8_flat_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_inc_i8_flat_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_inc_i8_flat_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw uinc_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_inc_i8_flat_agent_align2(ptr %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_inc_i8_flat_agent_align2(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_inc_i8_flat_agent_align2(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_inc_i8_flat_agent_align2(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = add i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP5]], i8 0, i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw uinc_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2
   ret i8 %res
@@ -753,74 +1185,136 @@ define i8 @test_atomicrmw_inc_i8_flat_agent_align4(ptr %ptr, i8 %value) {
 }
 
 define i8 @test_atomicrmw_dec_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_dec_i8_global_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_dec_i8_global_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; GCN-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_global_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_dec_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_dec_i8_global_agent_align2(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_dec_i8_global_agent_align2(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; GCN-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_global_agent_align2(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
   ret i8 %res
@@ -855,18 +1349,17 @@ define i8 @test_atomicrmw_dec_i8_global_agent_align4(ptr addrspace(1) %ptr, i8 %
 
 define i8 @test_atomicrmw_dec_i8_local(ptr addrspace(3) %ptr, i8 %value) {
 ; CHECK-LABEL: @test_atomicrmw_dec_i8_local(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
@@ -874,7 +1367,7 @@ define i8 @test_atomicrmw_dec_i8_local(ptr addrspace(3) %ptr, i8 %value) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
 ; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -882,7 +1375,7 @@ define i8 @test_atomicrmw_dec_i8_local(ptr addrspace(3) %ptr, i8 %value) {
 ; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
 ;
@@ -892,18 +1385,17 @@ define i8 @test_atomicrmw_dec_i8_local(ptr addrspace(3) %ptr, i8 %value) {
 
 define i8 @test_atomicrmw_dec_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) {
 ; CHECK-LABEL: @test_atomicrmw_dec_i8_local_align2(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
 ; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
@@ -911,7 +1403,7 @@ define i8 @test_atomicrmw_dec_i8_local_align2(ptr addrspace(3) %ptr, i8 %value)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
 ; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -919,7 +1411,7 @@ define i8 @test_atomicrmw_dec_i8_local_align2(ptr addrspace(3) %ptr, i8 %value)
 ; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
 ; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
 ;
@@ -955,74 +1447,136 @@ define i8 @test_atomicrmw_dec_i8_local_align4(ptr addrspace(3) %ptr, i8 %value)
 }
 
 define i8 @test_atomicrmw_dec_i8_flat_agent(ptr %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_dec_i8_flat_agent(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_dec_i8_flat_agent(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; GCN-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_flat_agent(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst
   ret i8 %res
 }
 
 define i8 @test_atomicrmw_dec_i8_flat_agent_align2(ptr %ptr, i8 %value) {
-; CHECK-LABEL: @test_atomicrmw_dec_i8_flat_agent_align2(
-; CHECK-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CHECK-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
-; CHECK-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
-; CHECK-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
-; CHECK-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
-; CHECK-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
-; CHECK-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
-; CHECK-NEXT:    ret i8 [[EXTRACTED3]]
+; GCN-LABEL: @test_atomicrmw_dec_i8_flat_agent_align2(
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; GCN:       atomicrmw.start:
+; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; GCN-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; GCN-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; GCN-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; GCN-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT:    [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN:       atomicrmw.end:
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT:    ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_dec_i8_flat_agent_align2(
+; R600-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT:    [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; R600:       atomicrmw.start:
+; R600-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT:    [[TMP4:%.*]] = sub i8 [[EXTRACTED]], 1
+; R600-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[EXTRACTED]], 0
+; R600-NEXT:    [[TMP6:%.*]] = icmp ugt i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; R600-NEXT:    [[NEW:%.*]] = select i1 [[TMP7]], i8 [[VALUE]], i8 [[TMP4]]
+; R600-NEXT:    [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT:    [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600:       atomicrmw.end:
+; R600-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT:    ret i8 [[EXTRACTED3]]
 ;
   %res = atomicrmw udec_wrap ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2
   ret i8 %res

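(The check-line churn above follows directly from the inferred DataLayout. These AtomicExpand tests run under both an amdgcn and an r600 triple, but previously both runs saw the default empty DataLayout, in which every pointer is 64 bits wide. Once the layout is inferred from the triple, all r600 pointers and amdgcn local, i.e. addrspace(3), pointers are 32 bits, so the shared CHECK prefix splits into GCN and R600 wherever the two targets now disagree, and the i64 pointer arithmetic, together with the trunc that produced the i32 SHIFTAMT, collapses to plain i32. A minimal sketch of the effect; the RUN line spelling and the function are illustrative rather than copied from the tests:

; RUN: opt -S -mtriple=r600-mesa-mesa3d -atomic-expand %s
define i8 @sketch(ptr addrspace(3) %ptr, i8 %value) {
  ; AtomicExpand widens the sub-word atomic to a cmpxchg loop on the
  ; containing aligned i32 word. With no DataLayout the aligned address
  ; came from @llvm.ptrmask.p3.i64 and the shift amount needed an
  ; i64-to-i32 trunc; with the 32-bit r600 layout inferred from the
  ; triple, the pass emits @llvm.ptrmask.p3.i32 and the trunc disappears.
  %res = atomicrmw udec_wrap ptr addrspace(3) %ptr, i8 %value seq_cst
  ret i8 %res
}
)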
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 93e43030040d35b..97c041168d147b5 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -1279,24 +1279,23 @@ define half @test_atomicrmw_fadd_f16_global_align4(ptr addrspace(1) %ptr, half %
 
 define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; CI-LABEL: @test_atomicrmw_fadd_f16_local(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CI-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CI-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CI:       atomicrmw.start:
 ; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; CI-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; CI-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1304,30 +1303,29 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; CI-NEXT:    ret half [[TMP7]]
 ;
 ; GFX9-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX9-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX9-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX9:       atomicrmw.start:
 ; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX9-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX9-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1335,30 +1333,29 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX9-NEXT:    ret half [[TMP7]]
 ;
 ; GFX908-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX908-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX908-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX908:       atomicrmw.start:
 ; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX908-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX908-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1366,30 +1363,29 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX908-NEXT:    ret half [[TMP7]]
 ;
 ; GFX90A-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX90A-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX90A:       atomicrmw.start:
 ; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX90A-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1397,30 +1393,29 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX90A-NEXT:    ret half [[TMP7]]
 ;
 ; GFX940-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX940-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX940-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX940:       atomicrmw.start:
 ; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX940-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX940-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1428,30 +1423,29 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX940-NEXT:    ret half [[TMP7]]
 ;
 ; GFX11-LABEL: @test_atomicrmw_fadd_f16_local(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX11-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX11-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX11:       atomicrmw.start:
 ; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GFX11-NEXT:    [[NEW:%.*]] = fadd half [[TMP4]], [[VALUE:%.*]]
 ; GFX11-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -1459,7 +1453,7 @@ define half @test_atomicrmw_fadd_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GFX11-NEXT:    ret half [[TMP7]]
@@ -2074,24 +2068,23 @@ define float @test_atomicrmw_fadd_f32_local_strictfp(ptr addrspace(3) %ptr, floa
 
 define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) {
 ; CI-LABEL: @test_atomicrmw_fadd_bf16_local(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CI-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CI-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CI:       atomicrmw.start:
 ; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2099,30 +2092,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %val
 ; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; CI-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX9-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX9-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX9:       atomicrmw.start:
 ; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2130,30 +2122,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %val
 ; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX9-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX908-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX908-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX908:       atomicrmw.start:
 ; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2161,30 +2152,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %val
 ; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX908-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX90A-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX90A:       atomicrmw.start:
 ; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2192,30 +2182,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %val
 ; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX90A-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX940-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX940-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX940:       atomicrmw.start:
 ; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2223,30 +2212,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %val
 ; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX940-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX11-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX11-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX11:       atomicrmw.start:
 ; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -2254,7 +2242,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %val
 ; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX11-NEXT:    ret bfloat [[TMP7]]
@@ -3040,24 +3028,23 @@ define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %p
 
 define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 {
 ; CI-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; CI-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; CI-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; CI-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CI-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CI-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CI-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CI-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CI-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; CI-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; CI-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CI:       atomicrmw.start:
 ; CI-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; CI-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; CI-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; CI-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; CI-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; CI-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; CI-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; CI-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; CI-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3065,30 +3052,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; CI-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; CI-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CI:       atomicrmw.end:
-; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CI-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; CI-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; CI-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; CI-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX9-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX9-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX9-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX9-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX9-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX9-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX9-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX9-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX9-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX9-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX9:       atomicrmw.start:
 ; GFX9-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX9-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX9-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX9-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX9-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX9-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX9-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX9-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX9-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3096,30 +3082,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; GFX9-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX9-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX9:       atomicrmw.end:
-; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX9-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX9-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX9-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX9-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX908-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX908-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX908-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX908-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX908-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX908-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX908-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX908-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX908-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX908-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX908:       atomicrmw.start:
 ; GFX908-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX908-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX908-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX908-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX908-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX908-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX908-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX908-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX908-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3127,30 +3112,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; GFX908-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX908-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX908:       atomicrmw.end:
-; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX908-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX908-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX908-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX908-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX90A-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX90A-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX90A-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX90A-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX90A-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX90A-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX90A-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX90A-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX90A-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX90A-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX90A:       atomicrmw.start:
 ; GFX90A-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX90A-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX90A-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX90A-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX90A-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX90A-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX90A-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX90A-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX90A-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3158,30 +3142,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; GFX90A-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX90A-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX90A:       atomicrmw.end:
-; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX90A-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX90A-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX90A-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX90A-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX940-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX940-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX940-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX940-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX940-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX940-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX940-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX940-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX940-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX940-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX940-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX940:       atomicrmw.start:
 ; GFX940-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX940-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX940-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX940-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX940-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX940-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX940-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX940-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX940-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3189,30 +3172,29 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; GFX940-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX940-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX940:       atomicrmw.end:
-; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX940-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX940-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX940-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX940-NEXT:    ret bfloat [[TMP7]]
 ;
 ; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GFX11-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GFX11-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GFX11-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GFX11-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GFX11-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GFX11-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GFX11-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GFX11-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GFX11-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GFX11-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX11:       atomicrmw.start:
 ; GFX11-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GFX11-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GFX11-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GFX11-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GFX11-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GFX11-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GFX11-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GFX11-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GFX11-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -3220,7 +3202,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; GFX11-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GFX11-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GFX11:       atomicrmw.end:
-; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GFX11-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GFX11-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GFX11-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GFX11-NEXT:    ret bfloat [[TMP7]]

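The AMDGPU expansions above, and the fmax/fmin/fsub files that follow, all change in the same way: once the datalayout is inferred from the amdgcn triple, addrspace(3) "local" pointers are 32 bits wide, so AtomicExpand can build its byte-offset arithmetic directly in i32. The i64 ptrtoint/and/shl chain and the trunc that produced [[SHIFTAMT]] collapse into the i32 value [[TMP2]], which the later lshr/shl instructions use as the shift amount. A minimal sketch of the inferred-layout behaviour (the function name is illustrative, not part of the patch):

  define i32 @local_bit_offset(ptr addrspace(3) %p) {
    ; Under the triple-derived layout, p3 pointers are 32-bit, so this
    ; ptrtoint is lossless and no trunc back to i32 is needed.
    %addr = ptrtoint ptr addrspace(3) %p to i32
    %lsb = and i32 %addr, 3          ; byte offset within the aligned word
    %shift = shl i32 %lsb, 3         ; byte offset -> bit offset
    ret i32 %shift
  }
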
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
index 5c4462caafb84ec..9dfbe9b4eb7413d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll
@@ -165,24 +165,23 @@ define half @test_atomicrmw_fmax_f16_global_align4(ptr addrspace(1) %ptr, half %
 
 define half @test_atomicrmw_fmax_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-LABEL: @test_atomicrmw_fmax_f16_local(
-; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GCN:       atomicrmw.start:
 ; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GCN-NEXT:    [[TMP5:%.*]] = call half @llvm.maxnum.f16(half [[TMP4]], half [[VALUE:%.*]])
 ; GCN-NEXT:    [[TMP6:%.*]] = bitcast half [[TMP5]] to i16
 ; GCN-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32
-; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT:    [[TMP7:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -190,7 +189,7 @@ define half @test_atomicrmw_fmax_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
 ; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN:       atomicrmw.end:
-; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT:    [[TMP8:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GCN-NEXT:    ret half [[TMP8]]

diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
index 0e32489286a16ad..5a732653b48b149 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll
@@ -165,24 +165,23 @@ define half @test_atomicrmw_fmin_f16_global_align4(ptr addrspace(1) %ptr, half %
 
 define half @test_atomicrmw_fmin_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-LABEL: @test_atomicrmw_fmin_f16_local(
-; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GCN:       atomicrmw.start:
 ; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GCN-NEXT:    [[TMP5:%.*]] = call half @llvm.minnum.f16(half [[TMP4]], half [[VALUE:%.*]])
 ; GCN-NEXT:    [[TMP6:%.*]] = bitcast half [[TMP5]] to i16
 ; GCN-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP6]] to i32
-; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT:    [[TMP7:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -190,7 +189,7 @@ define half @test_atomicrmw_fmin_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP7]], 0
 ; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN:       atomicrmw.end:
-; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT:    [[TMP8:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GCN-NEXT:    ret half [[TMP8]]

diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
index f1b15f4e466792e..9805c317b9215e5 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll
@@ -165,24 +165,23 @@ define half @test_atomicrmw_fsub_f16_global_align4(ptr addrspace(1) %ptr, half %
 
 define half @test_atomicrmw_fsub_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-LABEL: @test_atomicrmw_fsub_f16_local(
-; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GCN:       atomicrmw.start:
 ; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to half
 ; GCN-NEXT:    [[NEW:%.*]] = fsub half [[TMP4]], [[VALUE:%.*]]
 ; GCN-NEXT:    [[TMP5:%.*]] = bitcast half [[NEW]] to i16
 ; GCN-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
@@ -190,7 +189,7 @@ define half @test_atomicrmw_fsub_f16_local(ptr addrspace(3) %ptr, half %value) {
 ; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN:       atomicrmw.end:
-; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to half
 ; GCN-NEXT:    ret half [[TMP7]]
@@ -285,24 +284,23 @@ define float @test_atomicrmw_fsub_f32_global_strictfp(ptr addrspace(1) %ptr, flo
 
 define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %value) {
 ; GCN-LABEL: @test_atomicrmw_fadd_bf16_local(
-; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GCN:       atomicrmw.start:
 ; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GCN-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GCN-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GCN-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -310,7 +308,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local(ptr addrspace(3) %ptr, bfloat %val
 ; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN:       atomicrmw.end:
-; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GCN-NEXT:    ret bfloat [[TMP7]]
@@ -471,24 +469,23 @@ define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %p
 
 define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 {
 ; GCN-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i64(ptr addrspace(3) [[PTR:%.*]], i64 -4)
-; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i64
-; GCN-NEXT:    [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
-; GCN-NEXT:    [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
-; GCN-NEXT:    [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
-; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; GCN-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; GCN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; GCN-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; GCN-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; GCN-NEXT:    [[MASK:%.*]] = shl i32 65535, [[TMP2]]
 ; GCN-NEXT:    [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
 ; GCN-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
 ; GCN-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GCN:       atomicrmw.start:
 ; GCN-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; GCN-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
 ; GCN-NEXT:    [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]]
 ; GCN-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; GCN-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
-; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
 ; GCN-NEXT:    [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
 ; GCN-NEXT:    [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
 ; GCN-NEXT:    [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] monotonic monotonic, align 4
@@ -496,7 +493,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; GCN-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
 ; GCN-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; GCN:       atomicrmw.end:
-; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT:    [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
 ; GCN-NEXT:    [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
 ; GCN-NEXT:    [[TMP7:%.*]] = bitcast i16 [[EXTRACTED3]] to bfloat
 ; GCN-NEXT:    ret bfloat [[TMP7]]

diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
index 9a327f087bcd164..169d73cc0308d39 100644
--- a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
+++ b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll
@@ -33,16 +33,12 @@ define i1 @test_cmpxchg_seq_cst(ptr %addr, i128 %desire, i128 %new) {
 ; PWR7-NEXT:    [[TMP0:%.*]] = alloca i128, align 8
 ; PWR7-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP0]])
 ; PWR7-NEXT:    store i128 [[DESIRE:%.*]], ptr [[TMP0]], align 8
-; PWR7-NEXT:    [[TMP1:%.*]] = alloca i128, align 8
-; PWR7-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[TMP1]])
-; PWR7-NEXT:    store i128 [[NEW:%.*]], ptr [[TMP1]], align 8
-; PWR7-NEXT:    [[TMP2:%.*]] = call zeroext i1 @__atomic_compare_exchange(i64 16, ptr [[ADDR:%.*]], ptr [[TMP0]], ptr [[TMP1]], i32 5, i32 5)
-; PWR7-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP1]])
-; PWR7-NEXT:    [[TMP3:%.*]] = load i128, ptr [[TMP0]], align 8
+; PWR7-NEXT:    [[TMP1:%.*]] = call zeroext i1 @__atomic_compare_exchange_16(ptr [[ADDR:%.*]], ptr [[TMP0]], i128 [[NEW:%.*]], i32 5, i32 5)
+; PWR7-NEXT:    [[TMP2:%.*]] = load i128, ptr [[TMP0]], align 8
 ; PWR7-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr [[TMP0]])
-; PWR7-NEXT:    [[TMP4:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP3]], 0
-; PWR7-NEXT:    [[TMP5:%.*]] = insertvalue { i128, i1 } [[TMP4]], i1 [[TMP2]], 1
-; PWR7-NEXT:    [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP5]], 1
+; PWR7-NEXT:    [[TMP3:%.*]] = insertvalue { i128, i1 } poison, i128 [[TMP2]], 0
+; PWR7-NEXT:    [[TMP4:%.*]] = insertvalue { i128, i1 } [[TMP3]], i1 [[TMP1]], 1
+; PWR7-NEXT:    [[SUCC:%.*]] = extractvalue { i128, i1 } [[TMP4]], 1
 ; PWR7-NEXT:    ret i1 [[SUCC]]
 ;
 entry:

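The PowerPC cmpxchg change above is another consequence of the inferred layout: the i128 temporaries now carry their natural 16-byte alignment, which appears to satisfy the requirements for the size-specific libcall, so the generic __atomic_compare_exchange (which passes the new value indirectly through a second alloca) is replaced by __atomic_compare_exchange_16, which takes the value directly; that is why the extra alloca and its lifetime markers disappear. The two prototypes as they appear in the old and new output, respectively:

  declare zeroext i1 @__atomic_compare_exchange(i64, ptr, ptr, ptr, i32, i32)
  declare zeroext i1 @__atomic_compare_exchange_16(ptr, ptr, i128, i32, i32)
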
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll
index 826b9be7a1e5d33..8d71966c04d0396 100644
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll
@@ -4,10 +4,10 @@
 
 define i256 @atomic_load256_libcall(ptr %ptr) nounwind {
 ; CHECK-LABEL: @atomic_load256_libcall(
-; CHECK-NEXT:    [[TMP1:%.*]] = alloca i256, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = alloca i256, align 16
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP1]])
-; CHECK-NEXT:    call void @__atomic_load(i64 32, ptr [[PTR:%.*]], ptr [[TMP1]], i32 0)
-; CHECK-NEXT:    [[TMP2:%.*]] = load i256, ptr [[TMP1]], align 8
+; CHECK-NEXT:    call void @__atomic_load(i32 32, ptr [[PTR:%.*]], ptr [[TMP1]], i32 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = load i256, ptr [[TMP1]], align 16
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP1]])
 ; CHECK-NEXT:    ret i256 [[TMP2]]
 ;
@@ -18,10 +18,10 @@ define i256 @atomic_load256_libcall(ptr %ptr) nounwind {
 define i256 @atomic_load256_libcall_as1(ptr addrspace(1) %ptr) nounwind {
 ; CHECK-LABEL: @atomic_load256_libcall_as1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR:%.*]] to ptr
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca i256, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca i256, align 16
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr [[TMP2]])
-; CHECK-NEXT:    call void @__atomic_load(i64 32, ptr [[TMP1]], ptr [[TMP2]], i32 0)
-; CHECK-NEXT:    [[TMP3:%.*]] = load i256, ptr [[TMP2]], align 8
+; CHECK-NEXT:    call void @__atomic_load(i32 32, ptr [[TMP1]], ptr [[TMP2]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = load i256, ptr [[TMP2]], align 16
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr [[TMP2]])
 ; CHECK-NEXT:    ret i256 [[TMP3]]
 ;

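Two inferred-layout effects are visible in the X86 libcall test just above: the size argument of @__atomic_load is emitted in the target's pointer-sized integer type, which shrinks from i64 to i32 (suggesting this test runs with a 32-bit x86 triple), and the i256 temporaries pick up the 16-byte alignment the concrete layout assigns to wide integers instead of the previous 8. The resulting prototype under such a layout would be (parameter roles per the libatomic convention):

  declare void @__atomic_load(i32, ptr, ptr, i32)  ; (size, src, ret, memorder)
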
diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll
index be7ae0ef3f42c6f..7e6020629df8234 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/sink-addrspacecast.ll
@@ -7,10 +7,10 @@ define i64 @no_sink_local_to_flat(i1 %pred, ptr addrspace(3) %ptr) {
 ; CHECK-NEXT:    [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
 ; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(3) [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(3) [[PTR]], align 8
 ; CHECK-NEXT:    ret i64 [[V1]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 8
 ; CHECK-NEXT:    ret i64 [[V2]]
 ;
   %ptr_cast = addrspacecast ptr addrspace(3) %ptr to ptr
@@ -31,10 +31,10 @@ define i64 @no_sink_private_to_flat(i1 %pred, ptr addrspace(5) %ptr) {
 ; CHECK-NEXT:    [[PTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
 ; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(5) [[PTR]], align 8
 ; CHECK-NEXT:    ret i64 [[V1]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[PTR_CAST]], align 8
 ; CHECK-NEXT:    ret i64 [[V2]]
 ;
   %ptr_cast = addrspacecast ptr addrspace(5) %ptr to ptr
@@ -55,11 +55,11 @@ define i64 @sink_global_to_flat(i1 %pred, ptr addrspace(1) %ptr) {
 ; CHECK-SAME: i1 [[PRED:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
 ; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
 ; CHECK-NEXT:    ret i64 [[V1]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    ret i64 [[V2]]
 ;
   %ptr_cast = addrspacecast ptr addrspace(1) %ptr to ptr
@@ -79,11 +79,11 @@ define i64 @sink_flat_to_global(i1 %pred, ptr %ptr) {
 ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
 ; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 8
 ; CHECK-NEXT:    ret i64 [[V1]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 8
 ; CHECK-NEXT:    ret i64 [[V2]]
 ;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(1)
@@ -103,11 +103,11 @@ define i64 @sink_flat_to_constant(i1 %pred, ptr %ptr) {
 ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
 ; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 8
 ; CHECK-NEXT:    ret i64 [[V1]]
 ; CHECK:       l2:
 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 8
 ; CHECK-NEXT:    ret i64 [[V2]]
 ;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(4)
@@ -125,13 +125,13 @@ l2:
 define i64 @sink_flat_to_local(i1 %pred, ptr %ptr) {
 ; CHECK-LABEL: define i64 @sink_flat_to_local(
 ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[PTR_CAST:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
 ; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 8
 ; CHECK-NEXT:    ret i64 [[V1]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(3) [[TMP1]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(3) [[PTR_CAST]], align 8
 ; CHECK-NEXT:    ret i64 [[V2]]
 ;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(3)
@@ -149,13 +149,13 @@ l2:
 define i64 @sink_flat_to_private(i1 %pred, ptr %ptr) {
 ; CHECK-LABEL: define i64 @sink_flat_to_private(
 ; CHECK-SAME: i1 [[PRED:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[PTR_CAST:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
 ; CHECK-NEXT:    br i1 [[PRED]], label [[L1:%.*]], label [[L2:%.*]]
 ; CHECK:       l1:
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[PTR]], align 8
 ; CHECK-NEXT:    ret i64 [[V1]]
 ; CHECK:       l2:
-; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(5) [[TMP1]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr addrspace(5) [[PTR_CAST]], align 8
 ; CHECK-NEXT:    ret i64 [[V2]]
 ;
   %ptr_cast = addrspacecast ptr %ptr to ptr addrspace(5)

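The align 4 to align 8 churn in sink-addrspacecast.ll above, like the align 8 and align 16 updates in the DivRemPairs test below, is purely a printing difference: a load or store written without an explicit alignment is displayed with the ABI alignment taken from the module's datalayout, and LLVM's built-in default layout gives i64 an ABI alignment of only 4 bytes, while the triple-derived layouts specify 8 for i64 (and 16 for i128). An illustrative example, not from the patch:

  define i64 @load_no_align(ptr %p) {
    ; No alignment is written here; the printer shows "align 4" under the
    ; empty default layout but "align 8" once a layout with i64:64 applies.
    %v = load i64, ptr %p
    ret i64 %v
  }
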
diff --git a/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll b/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll
index 9a13d8bb9e1fccc..5340527a046a543 100644
--- a/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll
+++ b/llvm/test/Transforms/DivRemPairs/X86/div-expanded-rem-pair.ll
@@ -214,7 +214,7 @@ define i64 @remainder_triangle_i64(i64 %a, i64 %b, ptr %rp) {
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[A]], [[B]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 4
+; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 8
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret i64 [[DIV]]
@@ -246,7 +246,7 @@ define i128 @remainder_triangle_i128(i128 %a, i128 %b, ptr %rp) {
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i128 [[DIV]], [[B_FROZEN]]
 ; CHECK-NEXT:    [[REM_DECOMPOSED:%.*]] = sub i128 [[A_FROZEN]], [[TMP0]]
-; CHECK-NEXT:    store i128 [[REM_DECOMPOSED]], ptr [[RP]], align 4
+; CHECK-NEXT:    store i128 [[REM_DECOMPOSED]], ptr [[RP]], align 16
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret i128 [[DIV]]
@@ -275,7 +275,7 @@ define i64 @remainder_triangle_i64_multiple_rem_edges(i64 %a, i64 %b, i64 %c, pt
 ; CHECK-NEXT:    i64 2, label [[SW_BB]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
-; CHECK-NEXT:    store i64 [[REM]], ptr [[RP:%.*]], align 4
+; CHECK-NEXT:    store i64 [[REM]], ptr [[RP:%.*]], align 8
 ; CHECK-NEXT:    br label [[SW_DEFAULT]]
 ; CHECK:       sw.default:
 ; CHECK-NEXT:    ret i64 [[DIV]]
@@ -306,7 +306,7 @@ define i64 @remainder_triangle_i64_multiple_div_edges(i64 %a, i64 %b, i64 %c, pt
 ; CHECK-NEXT:    i64 2, label [[SW_BB]]
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.default:
-; CHECK-NEXT:    store i64 [[REM]], ptr [[RP:%.*]], align 4
+; CHECK-NEXT:    store i64 [[REM]], ptr [[RP:%.*]], align 8
 ; CHECK-NEXT:    br label [[SW_BB]]
 ; CHECK:       sw.bb:
 ; CHECK-NEXT:    ret i64 [[DIV]]
@@ -339,7 +339,7 @@ define i64 @remainder_triangle_i64_maythrow_rem(i64 %a, i64 %b, ptr %rp) {
 ; CHECK:       if.then:
 ; CHECK-NEXT:    call void @maythrow()
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 4
+; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 8
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[A]], [[B]]
@@ -369,7 +369,7 @@ define i64 @remainder_triangle_i64_maythrow_div(i64 %a, i64 %b, ptr %rp) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 4
+; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 8
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    call void @maythrow()
@@ -401,7 +401,7 @@ define i128 @remainder_triangle_i128_maythrow_rem(i128 %a, i128 %b, ptr %rp) {
 ; CHECK:       if.then:
 ; CHECK-NEXT:    call void @maythrow()
 ; CHECK-NEXT:    [[REM:%.*]] = urem i128 [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    store i128 [[REM]], ptr [[RP]], align 4
+; CHECK-NEXT:    store i128 [[REM]], ptr [[RP]], align 16
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[DIV:%.*]] = udiv i128 [[A]], [[B]]
@@ -431,7 +431,7 @@ define i128 @remainder_triangle_i128_maythrow_div(i128 %a, i128 %b, ptr %rp) {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[END:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i128 [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    store i128 [[REM]], ptr [[RP]], align 4
+; CHECK-NEXT:    store i128 [[REM]], ptr [[RP]], align 16
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
 ; CHECK-NEXT:    call void @maythrow()
@@ -464,7 +464,7 @@ define i64 @remainder_not_triangle_i32(i64 %a, i64 %b, i64 %c, ptr %rp) {
 ; CHECK-NEXT:    ]
 ; CHECK:       sw.bb:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    store i64 [[REM]], ptr [[RP:%.*]], align 4
+; CHECK-NEXT:    store i64 [[REM]], ptr [[RP:%.*]], align 8
 ; CHECK-NEXT:    br label [[SW_BB1]]
 ; CHECK:       sw.bb1:
 ; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[A]], [[B]]
@@ -501,7 +501,7 @@ define i64 @remainder_not_triangle_i32_2(i64 %a, i64 %b, i64 %c, ptr %rp) {
 ; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END3:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 4
+; CHECK-NEXT:    store i64 [[REM]], ptr [[RP]], align 8
 ; CHECK-NEXT:    [[TOBOOL1_NOT:%.*]] = icmp eq i64 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL1_NOT]], label [[IF_END3]], label [[RETURN:%.*]]
 ; CHECK:       if.end3:

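The ARM IndVarSimplify test below changes more substantively: with a concrete layout, SCEV can now compute an exact trip count for the outer loop, so the two-part exit test (an icmp ult and an icmp ne combined with an and) is replaced by a single canonical comparison against a bound precomputed in the preheader. Reading the new output, that bound is umin(n - 2, 2) + 2, where n is the value the test derives from its inputs; as a self-contained sketch mirroring the preheader computation in the CHECK-T1 output:

  declare i32 @llvm.umin.i32(i32, i32)

  define i32 @trip_count(i32 %n) {
    %sub  = add i32 %n, -2
    %umin = call i32 @llvm.umin.i32(i32 %sub, i32 2)
    %tc   = add nuw nsw i32 %umin, 2
    ret i32 %tc
  }
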
diff --git a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll
index 2c92f2a3153627a..b047bf5fc120c3d 100644
--- a/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll
+++ b/llvm/test/Transforms/IndVarSimplify/ARM/indvar-cost.ll
@@ -13,6 +13,9 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T1-NEXT:    [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0
 ; CHECK-T1-NEXT:    br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]]
 ; CHECK-T1:       while.cond5.preheader.preheader:
+; CHECK-T1-NEXT:    [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2
+; CHECK-T1-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 2)
+; CHECK-T1-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[UMIN]], 2
 ; CHECK-T1-NEXT:    br label [[WHILE_COND5_PREHEADER:%.*]]
 ; CHECK-T1:       while.cond5.preheader:
 ; CHECK-T1-NEXT:    [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ]
@@ -26,11 +29,11 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T1-NEXT:    [[PY_11076:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ]
 ; CHECK-T1-NEXT:    [[PX_11075:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ]
 ; CHECK-T1-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PX_11075]], i32 1
-; CHECK-T1-NEXT:    [[TMP0:%.*]] = load i16, ptr [[PX_11075]], align 2
-; CHECK-T1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-T1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[PX_11075]], align 2
+; CHECK-T1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
 ; CHECK-T1-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i16, ptr [[PY_11076]], i32 -1
-; CHECK-T1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[PY_11076]], align 2
-; CHECK-T1-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-T1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[PY_11076]], align 2
+; CHECK-T1-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP3]] to i32
 ; CHECK-T1-NEXT:    [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]]
 ; CHECK-T1-NEXT:    [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16
 ; CHECK-T1-NEXT:    [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16
@@ -42,17 +45,15 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T1-NEXT:    br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]]
 ; CHECK-T1:       while.end:
 ; CHECK-T1-NEXT:    [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ]
-; CHECK-T1-NEXT:    [[TMP2:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15
-; CHECK-T1-NEXT:    [[CONV10:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-T1-NEXT:    [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15
+; CHECK-T1-NEXT:    [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16
 ; CHECK-T1-NEXT:    [[INCDEC_PTR11]] = getelementptr inbounds i16, ptr [[POUT_01081]], i32 1
 ; CHECK-T1-NEXT:    store i16 [[CONV10]], ptr [[POUT_01081]], align 2
 ; CHECK-T1-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_PSRCB]], i32 [[COUNT_01084]]
 ; CHECK-T1-NEXT:    [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1
 ; CHECK-T1-NEXT:    [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1
-; CHECK-T1-NEXT:    [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3
-; CHECK-T1-NEXT:    [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0
-; CHECK-T1-NEXT:    [[TMP3:%.*]] = and i1 [[CMP4]], [[CMP3]]
-; CHECK-T1-NEXT:    br i1 [[TMP3]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]]
+; CHECK-T1-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[TMP1]]
+; CHECK-T1-NEXT:    br i1 [[EXITCOND]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]]
 ; CHECK-T1:       while.end13.loopexit:
 ; CHECK-T1-NEXT:    [[INCDEC_PTR11_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR11]], [[WHILE_END]] ]
 ; CHECK-T1-NEXT:    [[ADD_PTR_LCSSA:%.*]] = phi ptr [ [[ADD_PTR]], [[WHILE_END]] ]
@@ -85,34 +86,34 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T1-NEXT:    [[PY_31056:%.*]] = phi ptr [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ]
 ; CHECK-T1-NEXT:    [[PX_31055:%.*]] = phi ptr [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ]
 ; CHECK-T1-NEXT:    [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 1
-; CHECK-T1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2
-; CHECK-T1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[PX_31055]], align 2
+; CHECK-T1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2
+; CHECK-T1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[PX_31055]], align 2
 ; CHECK-T1-NEXT:    [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 2
 ; CHECK-T1-NEXT:    [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 1
-; CHECK-T1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2
-; CHECK-T1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[PY_31056]], align 2
+; CHECK-T1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2
+; CHECK-T1-NEXT:    [[TMP8:%.*]] = load i16, ptr [[PY_31056]], align 2
 ; CHECK-T1-NEXT:    [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -2
-; CHECK-T1-NEXT:    [[SHR_I892:%.*]] = sext i16 [[TMP5]] to i32
-; CHECK-T1-NEXT:    [[SHR1_I893:%.*]] = sext i16 [[TMP6]] to i32
+; CHECK-T1-NEXT:    [[SHR_I892:%.*]] = sext i16 [[TMP6]] to i32
+; CHECK-T1-NEXT:    [[SHR1_I893:%.*]] = sext i16 [[TMP7]] to i32
 ; CHECK-T1-NEXT:    [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]]
-; CHECK-T1-NEXT:    [[SHR2_I895:%.*]] = sext i16 [[TMP4]] to i32
-; CHECK-T1-NEXT:    [[SHR4_I897:%.*]] = sext i16 [[TMP7]] to i32
+; CHECK-T1-NEXT:    [[SHR2_I895:%.*]] = sext i16 [[TMP5]] to i32
+; CHECK-T1-NEXT:    [[SHR4_I897:%.*]] = sext i16 [[TMP8]] to i32
 ; CHECK-T1-NEXT:    [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]]
 ; CHECK-T1-NEXT:    [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]]
 ; CHECK-T1-NEXT:    [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]]
 ; CHECK-T1-NEXT:    [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 3
-; CHECK-T1-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2
-; CHECK-T1-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2
+; CHECK-T1-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2
+; CHECK-T1-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2
 ; CHECK-T1-NEXT:    [[ADD_PTR_I890]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 4
 ; CHECK-T1-NEXT:    [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -1
-; CHECK-T1-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2
-; CHECK-T1-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2
+; CHECK-T1-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2
+; CHECK-T1-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2
 ; CHECK-T1-NEXT:    [[ADD_PTR_I884]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -4
-; CHECK-T1-NEXT:    [[SHR_I870:%.*]] = sext i16 [[TMP9]] to i32
-; CHECK-T1-NEXT:    [[SHR1_I871:%.*]] = sext i16 [[TMP10]] to i32
+; CHECK-T1-NEXT:    [[SHR_I870:%.*]] = sext i16 [[TMP10]] to i32
+; CHECK-T1-NEXT:    [[SHR1_I871:%.*]] = sext i16 [[TMP11]] to i32
 ; CHECK-T1-NEXT:    [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]]
-; CHECK-T1-NEXT:    [[SHR2_I873:%.*]] = sext i16 [[TMP8]] to i32
-; CHECK-T1-NEXT:    [[SHR4_I875:%.*]] = sext i16 [[TMP11]] to i32
+; CHECK-T1-NEXT:    [[SHR2_I873:%.*]] = sext i16 [[TMP9]] to i32
+; CHECK-T1-NEXT:    [[SHR4_I875:%.*]] = sext i16 [[TMP12]] to i32
 ; CHECK-T1-NEXT:    [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]]
 ; CHECK-T1-NEXT:    [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]]
 ; CHECK-T1-NEXT:    [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]]
@@ -140,11 +141,11 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T1-NEXT:    [[PY_41064:%.*]] = phi ptr [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ]
 ; CHECK-T1-NEXT:    [[PX_41063:%.*]] = phi ptr [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ]
 ; CHECK-T1-NEXT:    [[INCDEC_PTR37]] = getelementptr inbounds i16, ptr [[PX_41063]], i32 1
-; CHECK-T1-NEXT:    [[TMP12:%.*]] = load i16, ptr [[PX_41063]], align 2
-; CHECK-T1-NEXT:    [[CONV38:%.*]] = sext i16 [[TMP12]] to i32
+; CHECK-T1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[PX_41063]], align 2
+; CHECK-T1-NEXT:    [[CONV38:%.*]] = sext i16 [[TMP13]] to i32
 ; CHECK-T1-NEXT:    [[INCDEC_PTR39]] = getelementptr inbounds i16, ptr [[PY_41064]], i32 -1
-; CHECK-T1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[PY_41064]], align 2
-; CHECK-T1-NEXT:    [[CONV40:%.*]] = sext i16 [[TMP13]] to i32
+; CHECK-T1-NEXT:    [[TMP14:%.*]] = load i16, ptr [[PY_41064]], align 2
+; CHECK-T1-NEXT:    [[CONV40:%.*]] = sext i16 [[TMP14]] to i32
 ; CHECK-T1-NEXT:    [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]]
 ; CHECK-T1-NEXT:    [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16
 ; CHECK-T1-NEXT:    [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16
@@ -159,8 +160,8 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T1-NEXT:    br label [[WHILE_END43]]
 ; CHECK-T1:       while.end43:
 ; CHECK-T1-NEXT:    [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ]
-; CHECK-T1-NEXT:    [[TMP14:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15
-; CHECK-T1-NEXT:    [[CONV45:%.*]] = trunc i32 [[TMP14]] to i16
+; CHECK-T1-NEXT:    [[TMP15:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15
+; CHECK-T1-NEXT:    [[CONV45:%.*]] = trunc i32 [[TMP15]] to i16
 ; CHECK-T1-NEXT:    [[INCDEC_PTR46]] = getelementptr inbounds i16, ptr [[POUT_11069]], i32 1
 ; CHECK-T1-NEXT:    store i16 [[CONV45]], ptr [[POUT_11069]], align 2
 ; CHECK-T1-NEXT:    [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1
@@ -184,6 +185,9 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T2-NEXT:    [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0
 ; CHECK-T2-NEXT:    br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]]
 ; CHECK-T2:       while.cond5.preheader.preheader:
+; CHECK-T2-NEXT:    [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2
+; CHECK-T2-NEXT:    [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP0]], i32 2)
+; CHECK-T2-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[UMIN]], 2
 ; CHECK-T2-NEXT:    br label [[WHILE_COND5_PREHEADER:%.*]]
 ; CHECK-T2:       while.cond5.preheader:
 ; CHECK-T2-NEXT:    [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ]
@@ -197,11 +201,11 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T2-NEXT:    [[PY_11076:%.*]] = phi ptr [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ]
 ; CHECK-T2-NEXT:    [[PX_11075:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ]
 ; CHECK-T2-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PX_11075]], i32 1
-; CHECK-T2-NEXT:    [[TMP0:%.*]] = load i16, ptr [[PX_11075]], align 2
-; CHECK-T2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-T2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[PX_11075]], align 2
+; CHECK-T2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
 ; CHECK-T2-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i16, ptr [[PY_11076]], i32 -1
-; CHECK-T2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[PY_11076]], align 2
-; CHECK-T2-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP1]] to i32
+; CHECK-T2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[PY_11076]], align 2
+; CHECK-T2-NEXT:    [[CONV9:%.*]] = sext i16 [[TMP3]] to i32
 ; CHECK-T2-NEXT:    [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]]
 ; CHECK-T2-NEXT:    [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16
 ; CHECK-T2-NEXT:    [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16
@@ -213,17 +217,15 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T2-NEXT:    br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]]
 ; CHECK-T2:       while.end:
 ; CHECK-T2-NEXT:    [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ]
-; CHECK-T2-NEXT:    [[TMP2:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15
-; CHECK-T2-NEXT:    [[CONV10:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-T2-NEXT:    [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15
+; CHECK-T2-NEXT:    [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16
 ; CHECK-T2-NEXT:    [[INCDEC_PTR11]] = getelementptr inbounds i16, ptr [[POUT_01081]], i32 1
 ; CHECK-T2-NEXT:    store i16 [[CONV10]], ptr [[POUT_01081]], align 2
 ; CHECK-T2-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_PSRCB]], i32 [[COUNT_01084]]
 ; CHECK-T2-NEXT:    [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1
 ; CHECK-T2-NEXT:    [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1
-; CHECK-T2-NEXT:    [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3
-; CHECK-T2-NEXT:    [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0
-; CHECK-T2-NEXT:    [[TMP3:%.*]] = and i1 [[CMP4]], [[CMP3]]
-; CHECK-T2-NEXT:    br i1 [[TMP3]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]]
+; CHECK-T2-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[TMP1]]
+; CHECK-T2-NEXT:    br i1 [[EXITCOND]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]]
 ; CHECK-T2:       while.end13.loopexit:
 ; CHECK-T2-NEXT:    [[INCDEC_PTR11_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR11]], [[WHILE_END]] ]
 ; CHECK-T2-NEXT:    [[ADD_PTR_LCSSA:%.*]] = phi ptr [ [[ADD_PTR]], [[WHILE_END]] ]
@@ -256,34 +258,34 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T2-NEXT:    [[PY_31056:%.*]] = phi ptr [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ]
 ; CHECK-T2-NEXT:    [[PX_31055:%.*]] = phi ptr [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ]
 ; CHECK-T2-NEXT:    [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 1
-; CHECK-T2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2
-; CHECK-T2-NEXT:    [[TMP5:%.*]] = load i16, ptr [[PX_31055]], align 2
+; CHECK-T2-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_I907]], align 2
+; CHECK-T2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[PX_31055]], align 2
 ; CHECK-T2-NEXT:    [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 2
 ; CHECK-T2-NEXT:    [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 1
-; CHECK-T2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2
-; CHECK-T2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[PY_31056]], align 2
+; CHECK-T2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I901]], align 2
+; CHECK-T2-NEXT:    [[TMP8:%.*]] = load i16, ptr [[PY_31056]], align 2
 ; CHECK-T2-NEXT:    [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -2
-; CHECK-T2-NEXT:    [[SHR_I892:%.*]] = sext i16 [[TMP5]] to i32
-; CHECK-T2-NEXT:    [[SHR1_I893:%.*]] = sext i16 [[TMP6]] to i32
+; CHECK-T2-NEXT:    [[SHR_I892:%.*]] = sext i16 [[TMP6]] to i32
+; CHECK-T2-NEXT:    [[SHR1_I893:%.*]] = sext i16 [[TMP7]] to i32
 ; CHECK-T2-NEXT:    [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]]
-; CHECK-T2-NEXT:    [[SHR2_I895:%.*]] = sext i16 [[TMP4]] to i32
-; CHECK-T2-NEXT:    [[SHR4_I897:%.*]] = sext i16 [[TMP7]] to i32
+; CHECK-T2-NEXT:    [[SHR2_I895:%.*]] = sext i16 [[TMP5]] to i32
+; CHECK-T2-NEXT:    [[SHR4_I897:%.*]] = sext i16 [[TMP8]] to i32
 ; CHECK-T2-NEXT:    [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]]
 ; CHECK-T2-NEXT:    [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]]
 ; CHECK-T2-NEXT:    [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]]
 ; CHECK-T2-NEXT:    [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 3
-; CHECK-T2-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2
-; CHECK-T2-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2
+; CHECK-T2-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I885]], align 2
+; CHECK-T2-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ADD_PTR_I912]], align 2
 ; CHECK-T2-NEXT:    [[ADD_PTR_I890]] = getelementptr inbounds i16, ptr [[PX_31055]], i32 4
 ; CHECK-T2-NEXT:    [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -1
-; CHECK-T2-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2
-; CHECK-T2-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2
+; CHECK-T2-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_I879]], align 2
+; CHECK-T2-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ADD_PTR_I906]], align 2
 ; CHECK-T2-NEXT:    [[ADD_PTR_I884]] = getelementptr inbounds i16, ptr [[PY_31056]], i32 -4
-; CHECK-T2-NEXT:    [[SHR_I870:%.*]] = sext i16 [[TMP9]] to i32
-; CHECK-T2-NEXT:    [[SHR1_I871:%.*]] = sext i16 [[TMP10]] to i32
+; CHECK-T2-NEXT:    [[SHR_I870:%.*]] = sext i16 [[TMP10]] to i32
+; CHECK-T2-NEXT:    [[SHR1_I871:%.*]] = sext i16 [[TMP11]] to i32
 ; CHECK-T2-NEXT:    [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]]
-; CHECK-T2-NEXT:    [[SHR2_I873:%.*]] = sext i16 [[TMP8]] to i32
-; CHECK-T2-NEXT:    [[SHR4_I875:%.*]] = sext i16 [[TMP11]] to i32
+; CHECK-T2-NEXT:    [[SHR2_I873:%.*]] = sext i16 [[TMP9]] to i32
+; CHECK-T2-NEXT:    [[SHR4_I875:%.*]] = sext i16 [[TMP12]] to i32
 ; CHECK-T2-NEXT:    [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]]
 ; CHECK-T2-NEXT:    [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]]
 ; CHECK-T2-NEXT:    [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]]
@@ -311,11 +313,11 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T2-NEXT:    [[PY_41064:%.*]] = phi ptr [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ]
 ; CHECK-T2-NEXT:    [[PX_41063:%.*]] = phi ptr [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ]
 ; CHECK-T2-NEXT:    [[INCDEC_PTR37]] = getelementptr inbounds i16, ptr [[PX_41063]], i32 1
-; CHECK-T2-NEXT:    [[TMP12:%.*]] = load i16, ptr [[PX_41063]], align 2
-; CHECK-T2-NEXT:    [[CONV38:%.*]] = sext i16 [[TMP12]] to i32
+; CHECK-T2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[PX_41063]], align 2
+; CHECK-T2-NEXT:    [[CONV38:%.*]] = sext i16 [[TMP13]] to i32
 ; CHECK-T2-NEXT:    [[INCDEC_PTR39]] = getelementptr inbounds i16, ptr [[PY_41064]], i32 -1
-; CHECK-T2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[PY_41064]], align 2
-; CHECK-T2-NEXT:    [[CONV40:%.*]] = sext i16 [[TMP13]] to i32
+; CHECK-T2-NEXT:    [[TMP14:%.*]] = load i16, ptr [[PY_41064]], align 2
+; CHECK-T2-NEXT:    [[CONV40:%.*]] = sext i16 [[TMP14]] to i32
 ; CHECK-T2-NEXT:    [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]]
 ; CHECK-T2-NEXT:    [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16
 ; CHECK-T2-NEXT:    [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16
@@ -330,8 +332,8 @@ define dso_local arm_aapcscc void @arm_conv_fast_q15(ptr %pSrcA, i32 %srcALen, p
 ; CHECK-T2-NEXT:    br label [[WHILE_END43]]
 ; CHECK-T2:       while.end43:
 ; CHECK-T2-NEXT:    [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ]
-; CHECK-T2-NEXT:    [[TMP14:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15
-; CHECK-T2-NEXT:    [[CONV45:%.*]] = trunc i32 [[TMP14]] to i16
+; CHECK-T2-NEXT:    [[TMP15:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15
+; CHECK-T2-NEXT:    [[CONV45:%.*]] = trunc i32 [[TMP15]] to i16
 ; CHECK-T2-NEXT:    [[INCDEC_PTR46]] = getelementptr inbounds i16, ptr [[POUT_11069]], i32 1
 ; CHECK-T2-NEXT:    store i16 [[CONV45]], ptr [[POUT_11069]], align 2
 ; CHECK-T2-NEXT:    [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1

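The renumbered TMP captures and the rewritten exit branch in both the CHECK-T1 and CHECK-T2 blocks above share one cause: with a concrete layout in effect, the loop exit is tested against a precomputed trip count instead of the old pair of compares. The relevant lines, assembled from the checks (value names here approximate the FileCheck captures):

    ; preheader:
    %0 = add i32 %srcALen.srcBLen, -2
    %umin = call i32 @llvm.umin.i32(i32 %0, i32 2)
    %1 = add nuw nsw i32 %umin, 2
    ; loop latch:
    %exitcond = icmp ne i32 %inc, %1
    br i1 %exitcond, label %while.cond5.preheader, label %while.end13.loopexit
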
diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
index f70a2962b9394b8..456155d7e4437db 100644
--- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -2,7 +2,7 @@
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK,CHECK-KNOWN,CHECK-NOLINUX,CHECK-OPEN,CHECK-DARWIN %s
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK,CHECK-KNOWN,CHECK-LINUX %s
 ; RUN: opt < %s -mtriple=nvptx -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-NOLINUX,CHECK-NVPTX %s
-; RUN: opt < %s -mtriple=powerpc-ibm-aix-xcoff -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-AIX %s
+; RUN: opt < %s -mtriple=powerpc64-ibm-aix-xcoff -passes=inferattrs -S | FileCheck --match-full-lines --check-prefixes=CHECK-NOLINUX,CHECK-AIX %s
 
 declare i32 @__nvvm_reflect(ptr)
 ; CHECK-NVPTX: declare noundef i32 @__nvvm_reflect(ptr noundef) [[NOFREE_NOUNWIND_READNONE:#[0-9]+]]

diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
index 24ccad4e157b936..baa78f82146f97c 100644
--- a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
+++ b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-linux"     | FileCheck %s
-; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-win32"     | FileCheck %s
-; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32"   | FileCheck %s
-; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-mingw32"   | FileCheck %s
-; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck %s
-; RUN: opt < %s -passes=instcombine -S -mtriple "sparc-sun-solaris" | FileCheck %s
+; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-linux"     | FileCheck %s --check-prefixes=CHECK,DOUBLE-4BYTE-ALIGN
+; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-win32"     | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN
+; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32"   | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN
+; RUN: opt < %s -passes=instcombine -S -mtriple "i386-pc-mingw32"   | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN
+; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN
+; RUN: opt < %s -passes=instcombine -S -mtriple "sparc-sun-solaris" | FileCheck %s --check-prefixes=CHECK,DOUBLE-8BYTE-ALIGN
 ; RUN: opt < %s -passes=instcombine -S -mtriple "x86_64-pc-win32" -enable-debugify 2>&1 | FileCheck --check-prefix=DBG-VALID %s
 
 declare double @floor(double)
@@ -708,12 +708,19 @@ define float @test_shrink_intrin_fabs_fast_fp16_src(half %C) {
 }
 
 define float @test_no_shrink_intrin_floor_multi_use_fpext(half %C) {
-; CHECK-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext(
-; CHECK-NEXT:    [[D:%.*]] = fpext half [[C:%.*]] to double
-; CHECK-NEXT:    store volatile double [[D]], ptr undef, align 8
-; CHECK-NEXT:    [[E:%.*]] = call double @llvm.floor.f64(double [[D]])
-; CHECK-NEXT:    [[F:%.*]] = fptrunc double [[E]] to float
-; CHECK-NEXT:    ret float [[F]]
+; DOUBLE-4BYTE-ALIGN-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext(
+; DOUBLE-4BYTE-ALIGN-NEXT:    [[D:%.*]] = fpext half [[C:%.*]] to double
+; DOUBLE-4BYTE-ALIGN-NEXT:    store volatile double [[D]], ptr undef, align 4
+; DOUBLE-4BYTE-ALIGN-NEXT:    [[E:%.*]] = call double @llvm.floor.f64(double [[D]])
+; DOUBLE-4BYTE-ALIGN-NEXT:    [[F:%.*]] = fptrunc double [[E]] to float
+; DOUBLE-4BYTE-ALIGN-NEXT:    ret float [[F]]
+;
+; DOUBLE-8BYTE-ALIGN-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext(
+; DOUBLE-8BYTE-ALIGN-NEXT:    [[D:%.*]] = fpext half [[C:%.*]] to double
+; DOUBLE-8BYTE-ALIGN-NEXT:    store volatile double [[D]], ptr undef, align 8
+; DOUBLE-8BYTE-ALIGN-NEXT:    [[E:%.*]] = call double @llvm.floor.f64(double [[D]])
+; DOUBLE-8BYTE-ALIGN-NEXT:    [[F:%.*]] = fptrunc double [[E]] to float
+; DOUBLE-8BYTE-ALIGN-NEXT:    ret float [[F]]
 ;
   %D = fpext half %C to double
   store volatile double %D, ptr undef
@@ -723,12 +730,19 @@ define float @test_no_shrink_intrin_floor_multi_use_fpext(half %C) {
 }
 
 define float @test_no_shrink_intrin_fabs_multi_use_fpext(half %C) {
-; CHECK-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext(
-; CHECK-NEXT:    [[D:%.*]] = fpext half [[C:%.*]] to double
-; CHECK-NEXT:    store volatile double [[D]], ptr undef, align 8
-; CHECK-NEXT:    [[E:%.*]] = call double @llvm.fabs.f64(double [[D]])
-; CHECK-NEXT:    [[F:%.*]] = fptrunc double [[E]] to float
-; CHECK-NEXT:    ret float [[F]]
+; DOUBLE-4BYTE-ALIGN-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext(
+; DOUBLE-4BYTE-ALIGN-NEXT:    [[D:%.*]] = fpext half [[C:%.*]] to double
+; DOUBLE-4BYTE-ALIGN-NEXT:    store volatile double [[D]], ptr undef, align 4
+; DOUBLE-4BYTE-ALIGN-NEXT:    [[E:%.*]] = call double @llvm.fabs.f64(double [[D]])
+; DOUBLE-4BYTE-ALIGN-NEXT:    [[F:%.*]] = fptrunc double [[E]] to float
+; DOUBLE-4BYTE-ALIGN-NEXT:    ret float [[F]]
+;
+; DOUBLE-8BYTE-ALIGN-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext(
+; DOUBLE-8BYTE-ALIGN-NEXT:    [[D:%.*]] = fpext half [[C:%.*]] to double
+; DOUBLE-8BYTE-ALIGN-NEXT:    store volatile double [[D]], ptr undef, align 8
+; DOUBLE-8BYTE-ALIGN-NEXT:    [[E:%.*]] = call double @llvm.fabs.f64(double [[D]])
+; DOUBLE-8BYTE-ALIGN-NEXT:    [[F:%.*]] = fptrunc double [[E]] to float
+; DOUBLE-8BYTE-ALIGN-NEXT:    ret float [[F]]
 ;
   %D = fpext half %C to double
   store volatile double %D, ptr undef

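The new prefix split above tracks a single property of the now-inferred layouts: the ABI alignment of double. The volatile store in the input carries no explicit alignment, so the printed alignment comes straight from the DataLayout. Input line and the two printed forms, as captured by the checks:

      store volatile double %D, ptr undef
    ; i386-pc-linux (DOUBLE-4BYTE-ALIGN):
    ;   store volatile double [[D]], ptr undef, align 4
    ; the other five triples (DOUBLE-8BYTE-ALIGN):
    ;   store volatile double [[D]], ptr undef, align 8
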
diff --git a/llvm/test/Transforms/InstCombine/ffs-i16.ll b/llvm/test/Transforms/InstCombine/ffs-i16.ll
index f567a9488d4985b..05c21fce5f185cc 100644
--- a/llvm/test/Transforms/InstCombine/ffs-i16.ll
+++ b/llvm/test/Transforms/InstCombine/ffs-i16.ll
@@ -13,13 +13,13 @@ declare void @sink(i16)
 
 define void @fold_ffs(i16 %x) {
 ; AVR-LABEL: @fold_ffs(
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]]
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    [[CTTZ:%.*]] = call addrspace(1) i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true), !range [[RNG0:![0-9]+]]
 ; AVR-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[CTTZ]], 1
 ; AVR-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[X]], 0
 ; AVR-NEXT:    [[NX:%.*]] = select i1 [[DOTNOT]], i16 0, i16 [[TMP1]]
-; AVR-NEXT:    call void @sink(i16 [[NX]])
+; AVR-NEXT:    call addrspace(1) void @sink(i16 [[NX]])
 ; AVR-NEXT:    ret void
 ;
 ; MSP430-LABEL: @fold_ffs(

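This and the following i16 libcall tests (fls-i16.ll, isascii-i16.ll, isdigit-i16.ll, printf-i16.ll, puts-i16.ll) all gain addrspace(1) on their AVR check lines. A plausible reading, offered as a sketch rather than something stated in the diff itself: AVR's default DataLayout places functions in program memory, address space 1, so once the layout is inferred from -mtriple=avr the declarations land there and every call site prints the address space:

    declare void @sink(i16)               ; under the AVR layout: a function in addrspace(1)

    call addrspace(1) void @sink(i16 0)   ; printed form of a direct call to it
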
diff --git a/llvm/test/Transforms/InstCombine/fls-i16.ll b/llvm/test/Transforms/InstCombine/fls-i16.ll
index ba0471665a48214..6d939b4e5c4e432 100644
--- a/llvm/test/Transforms/InstCombine/fls-i16.ll
+++ b/llvm/test/Transforms/InstCombine/fls-i16.ll
@@ -14,11 +14,11 @@ declare void @sink(i16)
 
 define void @fold_fls(i16 %x) {
 ; AVR-LABEL: @fold_fls(
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    [[CTLZ:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]]
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    [[CTLZ:%.*]] = call addrspace(1) i16 @llvm.ctlz.i16(i16 [[X:%.*]], i1 false), !range [[RNG0:![0-9]+]]
 ; AVR-NEXT:    [[NX:%.*]] = sub nuw nsw i16 16, [[CTLZ]]
-; AVR-NEXT:    call void @sink(i16 [[NX]])
+; AVR-NEXT:    call addrspace(1) void @sink(i16 [[NX]])
 ; AVR-NEXT:    ret void
 ;
 ; MSP430-LABEL: @fold_fls(

diff --git a/llvm/test/Transforms/InstCombine/isascii-i16.ll b/llvm/test/Transforms/InstCombine/isascii-i16.ll
index 2cac57e09c37eb9..c5a9384a1ba39b0 100644
--- a/llvm/test/Transforms/InstCombine/isascii-i16.ll
+++ b/llvm/test/Transforms/InstCombine/isascii-i16.ll
@@ -12,17 +12,17 @@ declare void @sink(i16)
 
 define void @fold_isascii(i16 %c) {
 ; AVR-LABEL: @fold_isascii(
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
 ; AVR-NEXT:    [[ISASCII:%.*]] = icmp ult i16 [[C:%.*]], 128
 ; AVR-NEXT:    [[IC:%.*]] = zext i1 [[ISASCII]] to i16
-; AVR-NEXT:    call void @sink(i16 [[IC]])
+; AVR-NEXT:    call addrspace(1) void @sink(i16 [[IC]])
 ; AVR-NEXT:    ret void
 ;
 ; MSP430-LABEL: @fold_isascii(

diff --git a/llvm/test/Transforms/InstCombine/isdigit-i16.ll b/llvm/test/Transforms/InstCombine/isdigit-i16.ll
index 69f7b7f2e1f702e..fd02fcef798cff5 100644
--- a/llvm/test/Transforms/InstCombine/isdigit-i16.ll
+++ b/llvm/test/Transforms/InstCombine/isdigit-i16.ll
@@ -11,22 +11,22 @@ declare void @sink(i16)
 
 define void @fold_isdigit(i16 %c) {
 ; AVR-LABEL: @fold_isdigit(
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    call void @sink(i16 1)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
-; AVR-NEXT:    call void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 1)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
+; AVR-NEXT:    call addrspace(1) void @sink(i16 0)
 ; AVR-NEXT:    [[ISDIGITTMP:%.*]] = add i16 [[C:%.*]], -48
 ; AVR-NEXT:    [[ISDIGIT:%.*]] = icmp ult i16 [[ISDIGITTMP]], 10
 ; AVR-NEXT:    [[IC:%.*]] = zext i1 [[ISDIGIT]] to i16
-; AVR-NEXT:    call void @sink(i16 [[IC]])
+; AVR-NEXT:    call addrspace(1) void @sink(i16 [[IC]])
 ; AVR-NEXT:    ret void
 ;
 ; MSP430-LABEL: @fold_isdigit(

diff --git a/llvm/test/Transforms/InstCombine/pow-4.ll b/llvm/test/Transforms/InstCombine/pow-4.ll
index dbe6477c0ecaaf3..09ce645130fa82f 100644
--- a/llvm/test/Transforms/InstCombine/pow-4.ll
+++ b/llvm/test/Transforms/InstCombine/pow-4.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=instcombine -S < %s -mtriple unknown                        | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKSQRT
-; RUN: opt -passes=instcombine -S < %s -mtriple unknown -disable-builtin sqrt  | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKNOSQRT
+; RUN: opt -passes=instcombine -S < %s -mtriple unknown -data-layout=e          | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKSQRT
+; RUN: opt -passes=instcombine -S < %s -mtriple unknown -data-layout=e -disable-builtin sqrt  | FileCheck %s --check-prefixes=CHECK,CHECKI32,CHECKNOSQRT
 ; RUN: opt -passes=instcombine -S < %s -mtriple msp430                        | FileCheck %s --check-prefixes=CHECK,CHECKI16,CHECKSQRT
 ; RUN: opt -passes=instcombine -S < %s -mtriple msp430 -disable-builtin sqrt  | FileCheck %s --check-prefixes=CHECK,CHECKI16,CHECKNOSQRT
 

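pow-4.ll keeps -mtriple unknown, which does not identify a real target, so the test now pins a minimal little-endian layout explicitly rather than relying on inference. The commit uses both available spellings; the in-file form appears in the pow_fp_int.ll hunk just below:

    ; on the opt command line:
    ;   opt -passes=instcombine -S < %s -mtriple unknown -data-layout=e
    ; or inside the IR file itself:
    target datalayout = "e"
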
diff --git a/llvm/test/Transforms/InstCombine/pow_fp_int.ll b/llvm/test/Transforms/InstCombine/pow_fp_int.ll
index e4d74f6bacb06a9..9c1fa88f3183ea2 100644
--- a/llvm/test/Transforms/InstCombine/pow_fp_int.ll
+++ b/llvm/test/Transforms/InstCombine/pow_fp_int.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -mtriple unknown -passes=instcombine -S < %s | FileCheck %s
+target datalayout = "e"
 
 ; PR42190
 ; Can't generate test checks due to PR42740.

diff --git a/llvm/test/Transforms/InstCombine/printf-i16.ll b/llvm/test/Transforms/InstCombine/printf-i16.ll
index f9cc4ebfbcc3428..2177eda3aecafc5 100644
--- a/llvm/test/Transforms/InstCombine/printf-i16.ll
+++ b/llvm/test/Transforms/InstCombine/printf-i16.ll
@@ -24,21 +24,21 @@ declare i16 @printf(ptr, ...)
 
 define void @xform_printf(i8 %c8, i16 %c16) {
 ; AVR-LABEL: @xform_printf(
-; AVR-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 1)
-; AVR-NEXT:    [[PUTCHAR1:%.*]] = call i16 @putchar(i16 1)
-; AVR-NEXT:    [[PUTCHAR2:%.*]] = call i16 @putchar(i16 1)
-; AVR-NEXT:    [[PUTCHAR3:%.*]] = call i16 @putchar(i16 127)
-; AVR-NEXT:    [[PUTCHAR4:%.*]] = call i16 @putchar(i16 127)
-; AVR-NEXT:    [[PUTCHAR5:%.*]] = call i16 @putchar(i16 127)
-; AVR-NEXT:    [[PUTCHAR6:%.*]] = call i16 @putchar(i16 128)
-; AVR-NEXT:    [[PUTCHAR7:%.*]] = call i16 @putchar(i16 128)
-; AVR-NEXT:    [[PUTCHAR8:%.*]] = call i16 @putchar(i16 128)
-; AVR-NEXT:    [[PUTCHAR9:%.*]] = call i16 @putchar(i16 255)
-; AVR-NEXT:    [[PUTCHAR10:%.*]] = call i16 @putchar(i16 255)
-; AVR-NEXT:    [[PUTCHAR11:%.*]] = call i16 @putchar(i16 255)
+; AVR-NEXT:    [[PUTCHAR:%.*]] = call addrspace(1) i16 @putchar(i16 1)
+; AVR-NEXT:    [[PUTCHAR1:%.*]] = call addrspace(1) i16 @putchar(i16 1)
+; AVR-NEXT:    [[PUTCHAR2:%.*]] = call addrspace(1) i16 @putchar(i16 1)
+; AVR-NEXT:    [[PUTCHAR3:%.*]] = call addrspace(1) i16 @putchar(i16 127)
+; AVR-NEXT:    [[PUTCHAR4:%.*]] = call addrspace(1) i16 @putchar(i16 127)
+; AVR-NEXT:    [[PUTCHAR5:%.*]] = call addrspace(1) i16 @putchar(i16 127)
+; AVR-NEXT:    [[PUTCHAR6:%.*]] = call addrspace(1) i16 @putchar(i16 128)
+; AVR-NEXT:    [[PUTCHAR7:%.*]] = call addrspace(1) i16 @putchar(i16 128)
+; AVR-NEXT:    [[PUTCHAR8:%.*]] = call addrspace(1) i16 @putchar(i16 128)
+; AVR-NEXT:    [[PUTCHAR9:%.*]] = call addrspace(1) i16 @putchar(i16 255)
+; AVR-NEXT:    [[PUTCHAR10:%.*]] = call addrspace(1) i16 @putchar(i16 255)
+; AVR-NEXT:    [[PUTCHAR11:%.*]] = call addrspace(1) i16 @putchar(i16 255)
 ; AVR-NEXT:    [[TMP1:%.*]] = zext i8 [[C8:%.*]] to i16
-; AVR-NEXT:    [[PUTCHAR12:%.*]] = call i16 @putchar(i16 [[TMP1]])
-; AVR-NEXT:    [[PUTCHAR13:%.*]] = call i16 @putchar(i16 [[C16:%.*]])
+; AVR-NEXT:    [[PUTCHAR12:%.*]] = call addrspace(1) i16 @putchar(i16 [[TMP1]])
+; AVR-NEXT:    [[PUTCHAR13:%.*]] = call addrspace(1) i16 @putchar(i16 [[C16:%.*]])
 ; AVR-NEXT:    ret void
 ;
 ; MSP430-LABEL: @xform_printf(

diff --git a/llvm/test/Transforms/InstCombine/puts-i16.ll b/llvm/test/Transforms/InstCombine/puts-i16.ll
index 92769083a8affe5..4ac3cc01ea04363 100644
--- a/llvm/test/Transforms/InstCombine/puts-i16.ll
+++ b/llvm/test/Transforms/InstCombine/puts-i16.ll
@@ -14,7 +14,7 @@ declare i16 @puts(ptr)
 define void @xform_puts(i16 %c) {
 ; Transform puts("") to putchar("\n").
 ; AVR-LABEL: @xform_puts(
-; AVR-NEXT:    [[PUTCHAR:%.*]] = call i16 @putchar(i16 10)
+; AVR-NEXT:    [[PUTCHAR:%.*]] = call addrspace(1) i16 @putchar(i16 10)
 ; AVR-NEXT:    ret void
 ;
 ; MSP430-LABEL: @xform_puts(

diff --git a/llvm/test/Transforms/InstCombine/sincospi.ll b/llvm/test/Transforms/InstCombine/sincospi.ll
index ef65e0b68c76982..c4aee306a03c792 100644
--- a/llvm/test/Transforms/InstCombine/sincospi.ll
+++ b/llvm/test/Transforms/InstCombine/sincospi.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.9 | FileCheck %s --check-prefix=CHECK-FLOAT-IN-VEC
-; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s
-; RUN: opt -passes=instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
-; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefix=CHECK-NO-SINCOS
-; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefix=CHECK-NO-SINCOS
-; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefix=CHECK-NO-SINCOS
+; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios7.0 | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE-ALIGN4
+; RUN: opt -passes=instcombine -S < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefixes=CHECK,CHECK-DOUBLE-ALIGN8
+; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-apple-macosx10.8 | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN8
+; RUN: opt -passes=instcombine -S < %s -mtriple=arm-apple-ios6.0 | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN4
+; RUN: opt -passes=instcombine -S < %s -mtriple=x86_64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK-NO-SINCOS,CHECK-NO-SINCOS-DOUBLE-ALIGN8
 
 
 attributes #0 = { readnone nounwind }
@@ -129,21 +129,37 @@ define double @test_instbased_f64() {
 ; CHECK-FLOAT-IN-VEC-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
 ; CHECK-FLOAT-IN-VEC-NEXT:    ret double [[RES]]
 ;
-; CHECK-LABEL: @test_instbased_f64(
-; CHECK-NEXT:    [[VAL:%.*]] = load double, ptr @var64, align 8
-; CHECK-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]])
-; CHECK-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
-; CHECK-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
-; CHECK-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
-; CHECK-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
-; CHECK-NEXT:    ret double [[RES]]
+; CHECK-DOUBLE-ALIGN4-LABEL: @test_instbased_f64(
+; CHECK-DOUBLE-ALIGN4-NEXT:    [[VAL:%.*]] = load double, ptr @var64, align 4
+; CHECK-DOUBLE-ALIGN4-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]])
+; CHECK-DOUBLE-ALIGN4-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
+; CHECK-DOUBLE-ALIGN4-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
+; CHECK-DOUBLE-ALIGN4-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
+; CHECK-DOUBLE-ALIGN4-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
+; CHECK-DOUBLE-ALIGN4-NEXT:    ret double [[RES]]
 ;
-; CHECK-NO-SINCOS-LABEL: @test_instbased_f64(
-; CHECK-NO-SINCOS-NEXT:    [[VAL:%.*]] = load double, ptr @var64, align 8
-; CHECK-NO-SINCOS-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]]
-; CHECK-NO-SINCOS-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
-; CHECK-NO-SINCOS-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
-; CHECK-NO-SINCOS-NEXT:    ret double [[RES]]
+; CHECK-DOUBLE-ALIGN8-LABEL: @test_instbased_f64(
+; CHECK-DOUBLE-ALIGN8-NEXT:    [[VAL:%.*]] = load double, ptr @var64, align 8
+; CHECK-DOUBLE-ALIGN8-NEXT:    [[SINCOSPI:%.*]] = call { double, double } @__sincospi_stret(double [[VAL]])
+; CHECK-DOUBLE-ALIGN8-NEXT:    [[SINPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 0
+; CHECK-DOUBLE-ALIGN8-NEXT:    [[COSPI:%.*]] = extractvalue { double, double } [[SINCOSPI]], 1
+; CHECK-DOUBLE-ALIGN8-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
+; CHECK-DOUBLE-ALIGN8-NEXT:    [[RES:%.*]] = fadd double [[SINPI]], [[COSPI]]
+; CHECK-DOUBLE-ALIGN8-NEXT:    ret double [[RES]]
+;
+; CHECK-NO-SINCOS-DOUBLE-ALIGN8-LABEL: @test_instbased_f64(
+; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT:    [[VAL:%.*]] = load double, ptr @var64, align 8
+; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
+; CHECK-NO-SINCOS-DOUBLE-ALIGN8-NEXT:    ret double [[RES]]
+;
+; CHECK-NO-SINCOS-DOUBLE-ALIGN4-LABEL: @test_instbased_f64(
+; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT:    [[VAL:%.*]] = load double, ptr @var64, align 4
+; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT:    [[SIN:%.*]] = call double @__sinpi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT:    [[COS:%.*]] = call double @__cospi(double [[VAL]]) #[[ATTR0]]
+; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT:    [[RES:%.*]] = fadd double [[SIN]], [[COS]]
+; CHECK-NO-SINCOS-DOUBLE-ALIGN4-NEXT:    ret double [[RES]]
 ;
   %val = load double, ptr @var64
   %sin = call double @__sinpi(double %val) #0

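The same double-alignment split as in double-float-shrink-2.ll appears here through an unannotated load: the 32-bit ARM Darwin triples end up with a 4-byte ABI alignment for double, while arm64 and x86_64 get 8. Input line and the two printed forms from the checks:

      %val = load double, ptr @var64
    ; arm-apple-ios (CHECK-DOUBLE-ALIGN4 / CHECK-NO-SINCOS-DOUBLE-ALIGN4):
    ;   load double, ptr @var64, align 4
    ; arm64 / x86_64 (CHECK-DOUBLE-ALIGN8 / CHECK-NO-SINCOS-DOUBLE-ALIGN8):
    ;   load double, ptr @var64, align 8
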
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll b/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll
index 36a69f3031b19c2..2ec147c2ad79f5f 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/calls-math-finite.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
-; RUN: opt < %s -passes=instsimplify -S -mtriple=unknown-unknown-linux-musl | FileCheck -check-prefix=MUSL %s
+; RUN: opt < %s -passes=instsimplify -S -mtriple=x86_64-unknown-linux-musl | FileCheck -check-prefix=MUSL %s
 
 ; Test to verify constant folding can occur when math routines are mapped
 ; to the __<func>_finite versions of functions due to __FINITE_MATH_ONLY__
 ; being enabled on headers on Linux. All calls should constant fold away
 ; in this test.
 
-target triple = "unknown-unknown-linux-gnu"
+target triple = "x86_64-unknown-linux-gnu"
 
 declare double @__acos_finite(double) #0
 declare float @__acosf_finite(float) #0

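Unlike the tests above, this one used triples with no architecture at all (unknown-unknown-linux-musl and unknown-unknown-linux-gnu), which leave nothing to infer a layout from; substituting a concrete 64-bit triple keeps the constant-folding checks well defined. The fix in its two places, as in the hunk:

    ; RUN: opt < %s -passes=instsimplify -S -mtriple=x86_64-unknown-linux-musl | FileCheck -check-prefix=MUSL %s
    target triple = "x86_64-unknown-linux-gnu"
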
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
index d034e6acf9f8695..32646b4c2b42dca 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/2012-01-13-phielim.ll
@@ -10,21 +10,23 @@ define i32 @test(ptr %base) nounwind uwtable ssp {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[WHILE_BODY_LR_PH_I:%.*]]
 ; CHECK:       while.body.lr.ph.i:
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 16
 ; CHECK-NEXT:    br label [[WHILE_BODY_I:%.*]]
 ; CHECK:       while.body.i:
 ; CHECK-NEXT:    [[INDVARS_IV7_I:%.*]] = phi i64 [ 16, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV_NEXT8_I:%.*]], [[COND_TRUE29_I:%.*]] ]
 ; CHECK-NEXT:    [[I_05_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_LR_PH_I]] ], [ [[INDVARS_IV7_I]], [[COND_TRUE29_I]] ]
+; CHECK-NEXT:    [[LSR4:%.*]] = trunc i64 [[I_05_I]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[LSR4]] to i64
+; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[SEXT_I:%.*]] = shl i64 [[I_05_I]], 32
 ; CHECK-NEXT:    [[IDX_EXT_I:%.*]] = ashr exact i64 [[SEXT_I]], 32
 ; CHECK-NEXT:    [[ADD_PTR_SUM_I:%.*]] = add i64 [[IDX_EXT_I]], 16
 ; CHECK-NEXT:    br label [[FOR_BODY_I:%.*]]
 ; CHECK:       for.body.i:
-; CHECK-NEXT:    [[INDVARS_IV_I:%.*]] = phi i64 [ 0, [[WHILE_BODY_I]] ], [ [[INDVARS_IV_NEXT_I:%.*]], [[FOR_BODY_I]] ]
-; CHECK-NEXT:    [[ADD_PTR_SUM:%.*]] = add i64 [[ADD_PTR_SUM_I]], [[INDVARS_IV_I]]
-; CHECK-NEXT:    [[ARRAYIDX22_I:%.*]] = getelementptr inbounds i8, ptr [[BASE:%.*]], i64 [[ADD_PTR_SUM]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX22_I]], align 1
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I]], 1
+; CHECK-NEXT:    [[LSR_IV2:%.*]] = phi ptr [ [[UGLYGEP3:%.*]], [[FOR_BODY_I]] ], [ [[UGLYGEP1]], [[WHILE_BODY_I]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[LSR_IV2]], align 1
 ; CHECK-NEXT:    [[CMP:%.*]] = call i1 @check() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[UGLYGEP3]] = getelementptr i8, ptr [[LSR_IV2]], i64 1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_END_I:%.*]], label [[FOR_BODY_I]]
 ; CHECK:       for.end.i:
 ; CHECK-NEXT:    [[ADD_PTR_I144:%.*]] = getelementptr inbounds i8, ptr [[BASE]], i64 [[ADD_PTR_SUM_I]]
@@ -93,18 +95,20 @@ define void @test2(i32 %n) nounwind uwtable {
 ; CHECK:       for.cond468.preheader:
 ; CHECK-NEXT:    br label [[FOR_COND468:%.*]]
 ; CHECK:       for.cond468:
-; CHECK-NEXT:    [[INDVARS_IV1163:%.*]] = phi i64 [ [[INDVARS_IV_NEXT1164:%.*]], [[IF_THEN477:%.*]] ], [ 1, [[FOR_COND468_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDVARS_IV1163]] to i32
-; CHECK-NEXT:    [[CMP469:%.*]] = icmp slt i32 [[TMP0]], [[N:%.*]]
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi i32 [ 1, [[FOR_COND468_PREHEADER]] ], [ [[LSR_IV_NEXT:%.*]], [[IF_THEN477:%.*]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ getelementptr inbounds ([5000 x %struct.anon.7.91.199.307.415.475.559.643.751.835.943.1003.1111.1219.1351.1375.1399.1435.1471.1483.1519.1531.1651.1771], ptr @tags, i64 0, i64 0, i32 2), [[FOR_COND468_PREHEADER]] ], [ [[UGLYGEP:%.*]], [[IF_THEN477]] ]
+; CHECK-NEXT:    [[K_0:%.*]] = load i32, ptr [[LSR_IV]], align 4
+; CHECK-NEXT:    [[CMP469:%.*]] = icmp slt i32 [[LSR_IV1]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP469]], label [[FOR_BODY471:%.*]], label [[FOR_INC498_PREHEADER:%.*]]
 ; CHECK:       for.body471:
-; CHECK-NEXT:    [[FIRST:%.*]] = getelementptr inbounds [5000 x %struct.anon.7.91.199.307.415.475.559.643.751.835.943.1003.1111.1219.1351.1375.1399.1435.1471.1483.1519.1531.1651.1771], ptr @tags, i64 0, i64 [[INDVARS_IV1163]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[FIRST]], align 4
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[UGLYGEP2]], align 4
 ; CHECK-NEXT:    br i1 false, label [[IF_THEN477]], label [[FOR_INC498_PREHEADER]]
 ; CHECK:       for.inc498.preheader:
 ; CHECK-NEXT:    br label [[FOR_INC498:%.*]]
 ; CHECK:       if.then477:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT1164]] = add i64 [[INDVARS_IV1163]], 1
+; CHECK-NEXT:    [[UGLYGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 12
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add nuw nsw i32 [[LSR_IV1]], 1
 ; CHECK-NEXT:    br label [[FOR_COND468]]
 ; CHECK:       for.inc498:
 ; CHECK-NEXT:    br label [[FOR_INC498]]
@@ -154,27 +158,27 @@ define fastcc void @test3(double* nocapture %u) nounwind uwtable ssp {
 ; CHECK:       for.body3.us.i:
 ; CHECK-NEXT:    [[INDVARS_IV_I_SV_PHI:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_I:%.*]], [[MESHBB]] ], [ 0, [[FOR_BODY3_LR_PH_US_I:%.*]] ]
 ; CHECK-NEXT:    [[OPQ_SA_CALC12:%.*]] = sub i32 undef, 227
-; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[INDVARS_IV_I_SV_PHI]], [[INDVARS_IV8_I_SV_PHI26:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
-; CHECK-NEXT:    [[MUL_I_US_I:%.*]] = mul nsw i32 0, [[TMP1]]
-; CHECK-NEXT:    [[ARRAYIDX5_US_I:%.*]] = getelementptr inbounds double, ptr [[U:%.*]], i64 [[INDVARS_IV_I_SV_PHI]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX5_US_I]], align 8
-; CHECK-NEXT:    [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I_SV_PHI]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[LSR_IV:%.*]], [[INDVARS_IV_I_SV_PHI]]
+; CHECK-NEXT:    [[TMP:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[MUL_I_US_I:%.*]] = mul nsw i32 0, [[TMP]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV_I_SV_PHI]], 3
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[U:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[UGLYGEP]], align 8
 ; CHECK-NEXT:    br i1 undef, label [[FOR_INC8_US_I:%.*]], label [[MESHBB]]
 ; CHECK:       for.body3.lr.ph.us.i.loopexit:
+; CHECK-NEXT:    [[LSR_IV_NEXT:%.*]] = add i64 [[LSR_IV]], 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US_I]]
 ; CHECK:       for.body3.lr.ph.us.i:
-; CHECK-NEXT:    [[INDVARS_IV8_I_SV_PHI26]] = phi i64 [ undef, [[MESHBB1]] ], [ [[INDVARS_IV8_I_SV_PHI24:%.*]], [[FOR_BODY3_LR_PH_US_I_LOOPEXIT:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX_US_I:%.*]] = getelementptr inbounds double, ptr undef, i64 [[INDVARS_IV8_I_SV_PHI26]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDVARS_IV8_I_SV_PHI26]], 1
+; CHECK-NEXT:    [[LSR_IV]] = phi i64 [ [[LSR_IV_NEXT]], [[FOR_BODY3_LR_PH_US_I_LOOPEXIT:%.*]] ], [ undef, [[MESHBB1]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US_I:%.*]] = getelementptr inbounds double, ptr undef, i64 [[LSR_IV]]
 ; CHECK-NEXT:    br label [[FOR_BODY3_US_I:%.*]]
 ; CHECK:       for.inc8.us.i2:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       eval_At_times_u.exit:
 ; CHECK-NEXT:    ret void
 ; CHECK:       meshBB:
-; CHECK-NEXT:    [[INDVARS_IV8_I_SV_PHI24]] = phi i64 [ undef, [[FOR_BODY3_US_I]] ], [ [[TMP3]], [[FOR_INC8_US_I]] ]
 ; CHECK-NEXT:    [[MESHSTACKVARIABLE_PHI:%.*]] = phi i32 [ [[OPQ_SA_CALC12]], [[FOR_BODY3_US_I]] ], [ undef, [[FOR_INC8_US_I]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_I]] = add i64 [[INDVARS_IV_I_SV_PHI]], 1
 ; CHECK-NEXT:    br i1 true, label [[FOR_BODY3_LR_PH_US_I_LOOPEXIT]], label [[FOR_BODY3_US_I]]
 ; CHECK:       meshBB1.loopexit:
 ; CHECK-NEXT:    br label [[MESHBB1]]

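With an x86-64 layout now available, LoopStrengthReduce in the test above switches to pointer induction variables: the per-iteration address is carried in a ptr phi and bumped by a byte-sized getelementptr instead of being recomputed from an integer index on every trip. The core of the new loop shape, copied from the checks (capture names lowered to plain IR names, call attributes omitted):

    for.body.i:
      %lsr.iv2 = phi ptr [ %uglygep3, %for.body.i ], [ %uglygep1, %while.body.i ]
      %1 = load i8, ptr %lsr.iv2, align 1
      %cmp = call i1 @check()
      %uglygep3 = getelementptr i8, ptr %lsr.iv2, i64 1
      br i1 %cmp, label %for.end.i, label %for.body.i
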
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll
index f8b5f406c6a2670..9581bf8854a10b3 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll
@@ -9,116 +9,110 @@ define void @runtime_unroll_generic(i32 %arg_0, ptr %arg_1, ptr %arg_2, ptr %arg
 ; CHECK-A55-NEXT:    [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0
 ; CHECK-A55-NEXT:    br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]]
 ; CHECK-A55:       for.body6.preheader:
-; CHECK-A55-NEXT:    [[XTRAITER:%.*]] = and i32 [[ARG_0]], 3
+; CHECK-A55-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[ARG_0]] to i64
+; CHECK-A55-NEXT:    [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3
 ; CHECK-A55-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[ARG_0]], 4
 ; CHECK-A55-NEXT:    br i1 [[TMP0]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY6_PREHEADER_NEW:%.*]]
 ; CHECK-A55:       for.body6.preheader.new:
-; CHECK-A55-NEXT:    [[UNROLL_ITER:%.*]] = and i32 [[ARG_0]], -4
+; CHECK-A55-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
 ; CHECK-A55-NEXT:    br label [[FOR_BODY6:%.*]]
 ; CHECK-A55:       for.body6:
-; CHECK-A55-NEXT:    [[K_03:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[INC_3:%.*]], [[FOR_BODY6]] ]
-; CHECK-A55-NEXT:    [[NITER:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ]
-; CHECK-A55-NEXT:    [[IDX_EXT:%.*]] = zext i32 [[K_03]] to i64
-; CHECK-A55-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[IDX_EXT]]
+; CHECK-A55-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY6]] ]
+; CHECK-A55-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ]
+; CHECK-A55-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-A55-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; CHECK-A55-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i32
-; CHECK-A55-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[IDX_EXT]]
+; CHECK-A55-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-A55-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX14]], align 2
 ; CHECK-A55-NEXT:    [[CONV15:%.*]] = sext i16 [[TMP2]] to i32
 ; CHECK-A55-NEXT:    [[MUL16:%.*]] = mul nsw i32 [[CONV15]], [[CONV]]
-; CHECK-A55-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[IDX_EXT]]
+; CHECK-A55-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-A55-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
 ; CHECK-A55-NEXT:    [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP3]]
 ; CHECK-A55-NEXT:    store i32 [[ADD21]], ptr [[ARRAYIDX20]], align 4
-; CHECK-A55-NEXT:    [[INC:%.*]] = or i32 [[K_03]], 1
-; CHECK-A55-NEXT:    [[IDX_EXT_1:%.*]] = zext i32 [[INC]] to i64
-; CHECK-A55-NEXT:    [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_1]]
+; CHECK-A55-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; CHECK-A55-NEXT:    [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-A55-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX10_1]], align 2
 ; CHECK-A55-NEXT:    [[CONV_1:%.*]] = sext i16 [[TMP4]] to i32
-; CHECK-A55-NEXT:    [[ARRAYIDX14_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_1]]
+; CHECK-A55-NEXT:    [[ARRAYIDX14_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-A55-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX14_1]], align 2
 ; CHECK-A55-NEXT:    [[CONV15_1:%.*]] = sext i16 [[TMP5]] to i32
 ; CHECK-A55-NEXT:    [[MUL16_1:%.*]] = mul nsw i32 [[CONV15_1]], [[CONV_1]]
-; CHECK-A55-NEXT:    [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_1]]
+; CHECK-A55-NEXT:    [[ARRAYIDX20_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-A55-NEXT:    [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX20_1]], align 4
 ; CHECK-A55-NEXT:    [[ADD21_1:%.*]] = add nsw i32 [[MUL16_1]], [[TMP6]]
 ; CHECK-A55-NEXT:    store i32 [[ADD21_1]], ptr [[ARRAYIDX20_1]], align 4
-; CHECK-A55-NEXT:    [[INC_1:%.*]] = or i32 [[K_03]], 2
-; CHECK-A55-NEXT:    [[IDX_EXT_2:%.*]] = zext i32 [[INC_1]] to i64
-; CHECK-A55-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_2]]
+; CHECK-A55-NEXT:    [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; CHECK-A55-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_1]]
 ; CHECK-A55-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX10_2]], align 2
 ; CHECK-A55-NEXT:    [[CONV_2:%.*]] = sext i16 [[TMP7]] to i32
-; CHECK-A55-NEXT:    [[ARRAYIDX14_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_2]]
+; CHECK-A55-NEXT:    [[ARRAYIDX14_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_1]]
 ; CHECK-A55-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX14_2]], align 2
 ; CHECK-A55-NEXT:    [[CONV15_2:%.*]] = sext i16 [[TMP8]] to i32
 ; CHECK-A55-NEXT:    [[MUL16_2:%.*]] = mul nsw i32 [[CONV15_2]], [[CONV_2]]
-; CHECK-A55-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_2]]
+; CHECK-A55-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_1]]
 ; CHECK-A55-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX20_2]], align 4
 ; CHECK-A55-NEXT:    [[ADD21_2:%.*]] = add nsw i32 [[MUL16_2]], [[TMP9]]
 ; CHECK-A55-NEXT:    store i32 [[ADD21_2]], ptr [[ARRAYIDX20_2]], align 4
-; CHECK-A55-NEXT:    [[INC_2:%.*]] = or i32 [[K_03]], 3
-; CHECK-A55-NEXT:    [[IDX_EXT_3:%.*]] = zext i32 [[INC_2]] to i64
-; CHECK-A55-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_3]]
+; CHECK-A55-NEXT:    [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; CHECK-A55-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-A55-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10_3]], align 2
 ; CHECK-A55-NEXT:    [[CONV_3:%.*]] = sext i16 [[TMP10]] to i32
-; CHECK-A55-NEXT:    [[ARRAYIDX14_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_3]]
+; CHECK-A55-NEXT:    [[ARRAYIDX14_3:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-A55-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX14_3]], align 2
 ; CHECK-A55-NEXT:    [[CONV15_3:%.*]] = sext i16 [[TMP11]] to i32
 ; CHECK-A55-NEXT:    [[MUL16_3:%.*]] = mul nsw i32 [[CONV15_3]], [[CONV_3]]
-; CHECK-A55-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_3]]
+; CHECK-A55-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-A55-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX20_3]], align 4
 ; CHECK-A55-NEXT:    [[ADD21_3:%.*]] = add nsw i32 [[MUL16_3]], [[TMP12]]
 ; CHECK-A55-NEXT:    store i32 [[ADD21_3]], ptr [[ARRAYIDX20_3]], align 4
-; CHECK-A55-NEXT:    [[INC_3]] = add nuw i32 [[K_03]], 4
-; CHECK-A55-NEXT:    [[NITER_NEXT_3]] = add i32 [[NITER]], 4
-; CHECK-A55-NEXT:    [[NITER_NCMP_3_NOT:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]]
-; CHECK-A55-NEXT:    br i1 [[NITER_NCMP_3_NOT]], label [[FOR_END_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-A55-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; CHECK-A55-NEXT:    [[NITER_NEXT_3]] = add i64 [[NITER]], 4
+; CHECK-A55-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
+; CHECK-A55-NEXT:    br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-A55:       for.end.loopexit.unr-lcssa:
-; CHECK-A55-NEXT:    [[K_03_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INC_3]], [[FOR_BODY6]] ]
-; CHECK-A55-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0
+; CHECK-A55-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3]], [[FOR_BODY6]] ]
+; CHECK-A55-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
 ; CHECK-A55-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL:%.*]]
 ; CHECK-A55:       for.body6.epil:
-; CHECK-A55-NEXT:    [[IDX_EXT_EPIL:%.*]] = zext i32 [[K_03_UNR]] to i64
-; CHECK-A55-NEXT:    [[ARRAYIDX10_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL]]
+; CHECK-A55-NEXT:    [[ARRAYIDX10_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_UNR]]
 ; CHECK-A55-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL]], align 2
 ; CHECK-A55-NEXT:    [[CONV_EPIL:%.*]] = sext i16 [[TMP13]] to i32
-; CHECK-A55-NEXT:    [[ARRAYIDX14_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL]]
+; CHECK-A55-NEXT:    [[ARRAYIDX14_EPIL:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_UNR]]
 ; CHECK-A55-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL]], align 2
 ; CHECK-A55-NEXT:    [[CONV15_EPIL:%.*]] = sext i16 [[TMP14]] to i32
 ; CHECK-A55-NEXT:    [[MUL16_EPIL:%.*]] = mul nsw i32 [[CONV15_EPIL]], [[CONV_EPIL]]
-; CHECK-A55-NEXT:    [[ARRAYIDX20_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL]]
+; CHECK-A55-NEXT:    [[ARRAYIDX20_EPIL:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_UNR]]
 ; CHECK-A55-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL]], align 4
 ; CHECK-A55-NEXT:    [[ADD21_EPIL:%.*]] = add nsw i32 [[MUL16_EPIL]], [[TMP15]]
 ; CHECK-A55-NEXT:    store i32 [[ADD21_EPIL]], ptr [[ARRAYIDX20_EPIL]], align 4
-; CHECK-A55-NEXT:    [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 1
+; CHECK-A55-NEXT:    [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 1
 ; CHECK-A55-NEXT:    br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_1:%.*]]
 ; CHECK-A55:       for.body6.epil.1:
-; CHECK-A55-NEXT:    [[INC_EPIL:%.*]] = add nuw i32 [[K_03_UNR]], 1
-; CHECK-A55-NEXT:    [[IDX_EXT_EPIL_1:%.*]] = zext i32 [[INC_EPIL]] to i64
-; CHECK-A55-NEXT:    [[ARRAYIDX10_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL_1]]
+; CHECK-A55-NEXT:    [[INDVARS_IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 1
+; CHECK-A55-NEXT:    [[ARRAYIDX10_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_EPIL]]
 ; CHECK-A55-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL_1]], align 2
 ; CHECK-A55-NEXT:    [[CONV_EPIL_1:%.*]] = sext i16 [[TMP16]] to i32
-; CHECK-A55-NEXT:    [[ARRAYIDX14_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL_1]]
+; CHECK-A55-NEXT:    [[ARRAYIDX14_EPIL_1:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_EPIL]]
 ; CHECK-A55-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL_1]], align 2
 ; CHECK-A55-NEXT:    [[CONV15_EPIL_1:%.*]] = sext i16 [[TMP17]] to i32
 ; CHECK-A55-NEXT:    [[MUL16_EPIL_1:%.*]] = mul nsw i32 [[CONV15_EPIL_1]], [[CONV_EPIL_1]]
-; CHECK-A55-NEXT:    [[ARRAYIDX20_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL_1]]
+; CHECK-A55-NEXT:    [[ARRAYIDX20_EPIL_1:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_EPIL]]
 ; CHECK-A55-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL_1]], align 4
 ; CHECK-A55-NEXT:    [[ADD21_EPIL_1:%.*]] = add nsw i32 [[MUL16_EPIL_1]], [[TMP18]]
 ; CHECK-A55-NEXT:    store i32 [[ADD21_EPIL_1]], ptr [[ARRAYIDX20_EPIL_1]], align 4
-; CHECK-A55-NEXT:    [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 2
+; CHECK-A55-NEXT:    [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 2
 ; CHECK-A55-NEXT:    br i1 [[EPIL_ITER_CMP_1_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_2:%.*]]
 ; CHECK-A55:       for.body6.epil.2:
-; CHECK-A55-NEXT:    [[INC_EPIL_1:%.*]] = add nuw i32 [[K_03_UNR]], 2
-; CHECK-A55-NEXT:    [[IDX_EXT_EPIL_2:%.*]] = zext i32 [[INC_EPIL_1]] to i64
-; CHECK-A55-NEXT:    [[ARRAYIDX10_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[IDX_EXT_EPIL_2]]
+; CHECK-A55-NEXT:    [[INDVARS_IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 2
+; CHECK-A55-NEXT:    [[ARRAYIDX10_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_2]], i64 [[INDVARS_IV_NEXT_EPIL_1]]
 ; CHECK-A55-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX10_EPIL_2]], align 2
 ; CHECK-A55-NEXT:    [[CONV_EPIL_2:%.*]] = sext i16 [[TMP19]] to i32
-; CHECK-A55-NEXT:    [[ARRAYIDX14_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[IDX_EXT_EPIL_2]]
+; CHECK-A55-NEXT:    [[ARRAYIDX14_EPIL_2:%.*]] = getelementptr inbounds i16, ptr [[ARG_3]], i64 [[INDVARS_IV_NEXT_EPIL_1]]
 ; CHECK-A55-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX14_EPIL_2]], align 2
 ; CHECK-A55-NEXT:    [[CONV15_EPIL_2:%.*]] = sext i16 [[TMP20]] to i32
 ; CHECK-A55-NEXT:    [[MUL16_EPIL_2:%.*]] = mul nsw i32 [[CONV15_EPIL_2]], [[CONV_EPIL_2]]
-; CHECK-A55-NEXT:    [[ARRAYIDX20_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[IDX_EXT_EPIL_2]]
+; CHECK-A55-NEXT:    [[ARRAYIDX20_EPIL_2:%.*]] = getelementptr inbounds i32, ptr [[ARG_1]], i64 [[INDVARS_IV_NEXT_EPIL_1]]
 ; CHECK-A55-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX20_EPIL_2]], align 4
 ; CHECK-A55-NEXT:    [[ADD21_EPIL_2:%.*]] = add nsw i32 [[MUL16_EPIL_2]], [[TMP21]]
 ; CHECK-A55-NEXT:    store i32 [[ADD21_EPIL_2]], ptr [[ARRAYIDX20_EPIL_2]], align 4
@@ -129,24 +123,26 @@ define void @runtime_unroll_generic(i32 %arg_0, ptr %arg_1, ptr %arg_2, ptr %arg
 ; CHECK-GENERIC-LABEL: @runtime_unroll_generic(
 ; CHECK-GENERIC-NEXT:  entry:
 ; CHECK-GENERIC-NEXT:    [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0
-; CHECK-GENERIC-NEXT:    br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6:%.*]]
+; CHECK-GENERIC-NEXT:    br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]]
+; CHECK-GENERIC:       for.body6.preheader:
+; CHECK-GENERIC-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[ARG_0]] to i64
+; CHECK-GENERIC-NEXT:    br label [[FOR_BODY6:%.*]]
 ; CHECK-GENERIC:       for.body6:
-; CHECK-GENERIC-NEXT:    [[K_03:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-GENERIC-NEXT:    [[IDX_EXT:%.*]] = zext i32 [[K_03]] to i64
-; CHECK-GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[IDX_EXT]]
+; CHECK-GENERIC-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY6]] ]
+; CHECK-GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[ARG_2:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-GENERIC-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; CHECK-GENERIC-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
-; CHECK-GENERIC-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[IDX_EXT]]
+; CHECK-GENERIC-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[ARG_3:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-GENERIC-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX14]], align 2
 ; CHECK-GENERIC-NEXT:    [[CONV15:%.*]] = sext i16 [[TMP1]] to i32
 ; CHECK-GENERIC-NEXT:    [[MUL16:%.*]] = mul nsw i32 [[CONV15]], [[CONV]]
-; CHECK-GENERIC-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[IDX_EXT]]
+; CHECK-GENERIC-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[ARG_1:%.*]], i64 [[INDVARS_IV]]
 ; CHECK-GENERIC-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
 ; CHECK-GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP2]]
 ; CHECK-GENERIC-NEXT:    store i32 [[ADD21]], ptr [[ARRAYIDX20]], align 4
-; CHECK-GENERIC-NEXT:    [[INC]] = add nuw i32 [[K_03]], 1
-; CHECK-GENERIC-NEXT:    [[CMP5:%.*]] = icmp ult i32 [[INC]], [[ARG_0]]
-; CHECK-GENERIC-NEXT:    br i1 [[CMP5]], label [[FOR_BODY6]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-GENERIC-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-GENERIC-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-GENERIC-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-GENERIC:       for.end:
 ; CHECK-GENERIC-NEXT:    ret void
 ;

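The CHECK-A55 and CHECK-GENERIC churn in @runtime_unroll_generic above is induction-variable widening, not a change in the unroller itself: once a datalayout is inferred from the AArch64 triple (native integer widths n32:64), indvars can promote the i32 counter to a single i64 induction variable, so the per-iteration zext disappears and the epilogue iteration compares switch from i32 to i64. A minimal sketch of the input shape involved, with a hypothetical function name, run through `opt -passes=indvars` with no explicit datalayout:

  target triple = "aarch64-unknown-linux-gnu"

  define void @widen_me(ptr %p, i32 %n) {
  entry:
    %guard = icmp eq i32 %n, 0
    br i1 %guard, label %exit, label %loop
  loop:
    ; indvars rewrites %i as an i64 phi and folds the zext away once
    ; the layout says i64 is a native integer width
    %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
    %idx = zext i32 %i to i64
    %gep = getelementptr inbounds i32, ptr %p, i64 %idx
    store i32 0, ptr %gep, align 4
    %inc = add nuw i32 %i, 1
    %cmp = icmp ult i32 %inc, %n
    br i1 %cmp, label %loop, label %exit
  exit:
    ret void
  }
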
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll
index e1701ee75ea461c..216bf489bc66ecb 100644
--- a/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll
+++ b/llvm/test/Transforms/LoopUnroll/ARM/instr-size-costs.ll
@@ -103,19 +103,19 @@ define void @test_i64_add_optsize(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK-V8-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ]
 ; CHECK-V8-NEXT:    [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]]
 ; CHECK-V8-NEXT:    [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4
-; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4
+; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8
+; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8
 ; CHECK-V8-NEXT:    [[RES:%.*]] = add i64 [[DATA_A]], [[DATA_B]]
 ; CHECK-V8-NEXT:    [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    store i64 [[RES]], ptr [[ADDR_C]], align 4
+; CHECK-V8-NEXT:    store i64 [[RES]], ptr [[ADDR_C]], align 8
 ; CHECK-V8-NEXT:    [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-V8-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i64, ptr [[A]], i32 [[COUNT]]
 ; CHECK-V8-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i64, ptr [[B]], i32 [[COUNT]]
-; CHECK-V8-NEXT:    [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 4
-; CHECK-V8-NEXT:    [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 4
+; CHECK-V8-NEXT:    [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 8
+; CHECK-V8-NEXT:    [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 8
 ; CHECK-V8-NEXT:    [[RES_1:%.*]] = add i64 [[DATA_A_1]], [[DATA_B_1]]
 ; CHECK-V8-NEXT:    [[ADDR_C_1:%.*]] = getelementptr i64, ptr [[C]], i32 [[COUNT]]
-; CHECK-V8-NEXT:    store i64 [[RES_1]], ptr [[ADDR_C_1]], align 4
+; CHECK-V8-NEXT:    store i64 [[RES_1]], ptr [[ADDR_C_1]], align 8
 ; CHECK-V8-NEXT:    [[COUNT_1]] = add nuw nsw i32 [[IV]], 2
 ; CHECK-V8-NEXT:    [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100
 ; CHECK-V8-NEXT:    br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]]
@@ -150,19 +150,19 @@ define void @test_i64_add_minsize(ptr %a, ptr %b, ptr %c) #1 {
 ; CHECK-V8-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[COUNT_1:%.*]], [[LOOP]] ]
 ; CHECK-V8-NEXT:    [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]]
 ; CHECK-V8-NEXT:    [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4
-; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4
+; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8
+; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8
 ; CHECK-V8-NEXT:    [[RES:%.*]] = add i64 [[DATA_A]], [[DATA_B]]
 ; CHECK-V8-NEXT:    [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    store i64 [[RES]], ptr [[ADDR_C]], align 4
+; CHECK-V8-NEXT:    store i64 [[RES]], ptr [[ADDR_C]], align 8
 ; CHECK-V8-NEXT:    [[COUNT:%.*]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-V8-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i64, ptr [[A]], i32 [[COUNT]]
 ; CHECK-V8-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i64, ptr [[B]], i32 [[COUNT]]
-; CHECK-V8-NEXT:    [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 4
-; CHECK-V8-NEXT:    [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 4
+; CHECK-V8-NEXT:    [[DATA_A_1:%.*]] = load i64, ptr [[ADDR_A_1]], align 8
+; CHECK-V8-NEXT:    [[DATA_B_1:%.*]] = load i64, ptr [[ADDR_B_1]], align 8
 ; CHECK-V8-NEXT:    [[RES_1:%.*]] = add i64 [[DATA_A_1]], [[DATA_B_1]]
 ; CHECK-V8-NEXT:    [[ADDR_C_1:%.*]] = getelementptr i64, ptr [[C]], i32 [[COUNT]]
-; CHECK-V8-NEXT:    store i64 [[RES_1]], ptr [[ADDR_C_1]], align 4
+; CHECK-V8-NEXT:    store i64 [[RES_1]], ptr [[ADDR_C_1]], align 8
 ; CHECK-V8-NEXT:    [[COUNT_1]] = add nuw nsw i32 [[IV]], 2
 ; CHECK-V8-NEXT:    [[END_1:%.*]] = icmp ne i32 [[COUNT_1]], 100
 ; CHECK-V8-NEXT:    br i1 [[END_1]], label [[LOOP]], label [[EXIT:%.*]]
@@ -310,13 +310,13 @@ define i64 @test_i64_select_optsize(ptr %a, ptr %b, ptr %c) #0 {
 ; CHECK-V8-NEXT:    [[ACC:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-V8-NEXT:    [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]]
 ; CHECK-V8-NEXT:    [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4
-; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4
+; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8
+; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8
 ; CHECK-V8-NEXT:    [[UGT:%.*]] = icmp ugt i64 [[DATA_A]], [[DATA_B]]
 ; CHECK-V8-NEXT:    [[UMAX:%.*]] = select i1 [[UGT]], i64 [[DATA_A]], i64 [[DATA_B]]
 ; CHECK-V8-NEXT:    [[ACC_NEXT]] = add i64 [[UMAX]], [[ACC]]
 ; CHECK-V8-NEXT:    [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    store i64 [[UMAX]], ptr [[ADDR_C]], align 4
+; CHECK-V8-NEXT:    store i64 [[UMAX]], ptr [[ADDR_C]], align 8
 ; CHECK-V8-NEXT:    [[COUNT]] = add nuw i32 [[IV]], 1
 ; CHECK-V8-NEXT:    [[END:%.*]] = icmp ne i32 [[COUNT]], 100
 ; CHECK-V8-NEXT:    br i1 [[END]], label [[LOOP]], label [[EXIT:%.*]]
@@ -356,13 +356,13 @@ define i64 @test_i64_select_minsize(ptr %a, ptr %b, ptr %c) #1 {
 ; CHECK-V8-NEXT:    [[ACC:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ACC_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-V8-NEXT:    [[ADDR_A:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 [[IV]]
 ; CHECK-V8-NEXT:    [[ADDR_B:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 4
-; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 4
+; CHECK-V8-NEXT:    [[DATA_A:%.*]] = load i64, ptr [[ADDR_A]], align 8
+; CHECK-V8-NEXT:    [[DATA_B:%.*]] = load i64, ptr [[ADDR_B]], align 8
 ; CHECK-V8-NEXT:    [[UGT:%.*]] = icmp ugt i64 [[DATA_A]], [[DATA_B]]
 ; CHECK-V8-NEXT:    [[UMAX:%.*]] = select i1 [[UGT]], i64 [[DATA_A]], i64 [[DATA_B]]
 ; CHECK-V8-NEXT:    [[ACC_NEXT]] = add i64 [[UMAX]], [[ACC]]
 ; CHECK-V8-NEXT:    [[ADDR_C:%.*]] = getelementptr i64, ptr [[C:%.*]], i32 [[IV]]
-; CHECK-V8-NEXT:    store i64 [[UMAX]], ptr [[ADDR_C]], align 4
+; CHECK-V8-NEXT:    store i64 [[UMAX]], ptr [[ADDR_C]], align 8
 ; CHECK-V8-NEXT:    [[COUNT]] = add nuw i32 [[IV]], 1
 ; CHECK-V8-NEXT:    [[END:%.*]] = icmp ne i32 [[COUNT]], 100
 ; CHECK-V8-NEXT:    br i1 [[END]], label [[LOOP]], label [[EXIT:%.*]]

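The align 4 to align 8 updates in this file, and in most of the AArch64 and RISC-V files below, share one cause: an i64 load or store with no explicit alignment takes the ABI alignment of i64 from the module's datalayout, and LLVM's built-in default layout specifies i64:32:64 (a 4-byte ABI alignment) while the layouts inferred from these triples give i64 a full 8-byte alignment. A minimal sketch, with a hypothetical function name:

  target triple = "aarch64-unknown-linux-gnu"

  define i64 @abi_align(ptr %p) {
    ; Unannotated in the source: the parser fills in the ABI alignment,
    ; so this round-trips as "align 4" under the built-in default
    ; layout and as "align 8" under the layout inferred from the triple.
    %v = load i64, ptr %p
    ret i64 %v
  }
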
diff --git a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
index ae23a2e9ce35231..c2e3f59ec678d3b 100644
--- a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
+++ b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll
@@ -17,7 +17,7 @@ define void @test(ptr %x, i32 %n) {
 ; CHECK-NEXT:    store i32 0, ptr [[X]], align 4
 ; CHECK-NEXT:    br label [[IF_END]]
 ; CHECK:       if.end:
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[REM]], 1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_BODY_1:%.*]], label [[WHILE_END]]
 ; CHECK:       while.body.1:
@@ -28,7 +28,7 @@ define void @test(ptr %x, i32 %n) {
 ; CHECK-NEXT:    store i32 0, ptr [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    br label [[IF_END_1]]
 ; CHECK:       if.end.1:
-; CHECK-NEXT:    [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 2
+; CHECK-NEXT:    [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 2
 ; CHECK-NEXT:    [[CMP_1:%.*]] = icmp sgt i32 [[REM]], 2
 ; CHECK-NEXT:    br i1 [[CMP_1]], label [[WHILE_BODY_2:%.*]], label [[WHILE_END]]
 ; CHECK:       while.body.2:

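The i64 to i32 flip in upperbound.ll is a pointer-width effect rather than an alignment one: passes that materialize GEPs pick their index type from the datalayout's pointer index width, which is 64 bits under the built-in default layout but 32 bits once a 32-bit ARM layout is inferred from the triple. A minimal sketch; the function name and triple here are illustrative, since the test's RUN line is not part of this excerpt:

  target triple = "armv7-unknown-linux-gnueabihf"

  define ptr @next_elt(ptr %p) {
    ; Canonicalized to "i64 1" under the 64-bit default layout and to
    ; "i32 1" under the inferred 32-bit ARM layout.
    %q = getelementptr inbounds i32, ptr %p, i32 1
    ret ptr %q
  }
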
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index a78878c55a1fc93..19a970fb0716f0f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -23,10 +23,10 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE:       vector.body:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
+; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
 ; TFNONE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; TFNONE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
+; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
 ; TFNONE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; TFNONE-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -40,10 +40,10 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE:       for.body:
 ; TFNONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFNONE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; TFNONE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; TFNONE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -66,10 +66,10 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFCOMMON-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFCOMMON-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFCOMMON-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; TFCOMMON-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFCOMMON-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; TFCOMMON-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFCOMMON-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
@@ -359,10 +359,10 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE:       vector.body:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
+; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
 ; TFNONE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
 ; TFNONE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
+; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
 ; TFNONE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; TFNONE-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -376,10 +376,10 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE:       for.body:
 ; TFNONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFNONE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; TFNONE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; TFNONE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
@@ -392,10 +392,10 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFALWAYS:       for.body:
 ; TFALWAYS-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFALWAYS-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
-; TFALWAYS-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; TFALWAYS-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; TFALWAYS-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
 ; TFALWAYS-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDVARS_IV]]
-; TFALWAYS-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFALWAYS-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFALWAYS-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; TFALWAYS-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; TFALWAYS-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
@@ -417,10 +417,10 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK:       vector.body:
 ; TFFALLBACK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
+; TFFALLBACK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
 ; TFFALLBACK-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
 ; TFFALLBACK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
+; TFFALLBACK-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
 ; TFFALLBACK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; TFFALLBACK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFFALLBACK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -432,10 +432,10 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK:       for.body:
 ; TFFALLBACK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; TFFALLBACK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; TFFALLBACK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; TFFALLBACK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]]
 ; TFFALLBACK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; TFFALLBACK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFFALLBACK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFFALLBACK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; TFFALLBACK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; TFFALLBACK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -479,10 +479,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE:       vector.body:
 ; TFNONE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFNONE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 4
+; TFNONE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP4]], align 8
 ; TFNONE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> [[WIDE_LOAD]])
 ; TFNONE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 4
+; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP5]], ptr [[TMP6]], align 8
 ; TFNONE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; TFNONE-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
@@ -496,10 +496,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE:       for.body:
 ; TFNONE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; TFNONE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; TFNONE-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; TFNONE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; TFNONE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
@@ -522,10 +522,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFALWAYS-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFALWAYS-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFALWAYS-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFALWAYS-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; TFALWAYS-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFALWAYS-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFALWAYS-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; TFALWAYS-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFALWAYS-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
@@ -552,10 +552,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; TFFALLBACK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; TFFALLBACK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
 ; TFFALLBACK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFFALLBACK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP6]], ptr [[TMP7]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFFALLBACK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; TFFALLBACK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFFALLBACK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
@@ -611,7 +611,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT:    [[TMP6:%.*]] = fptoui <vscale x 2 x double> [[WIDE_LOAD]] to <vscale x 2 x i64>
 ; TFNONE-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP6]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
 ; TFNONE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 4
+; TFNONE-NEXT:    store <vscale x 2 x i64> [[TMP7]], ptr [[TMP8]], align 8
 ; TFNONE-NEXT:    [[TMP9]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP5]])
 ; TFNONE-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; TFNONE-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 2
@@ -633,7 +633,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT:    [[TOINT:%.*]] = fptoui double [[LOAD]] to i64
 ; TFNONE-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[TOINT]]) #[[ATTR3]]
 ; TFNONE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; TFNONE-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; TFNONE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; TFNONE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; TFNONE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -665,7 +665,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFALWAYS-NEXT:    [[TMP7:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64>
 ; TFALWAYS-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFALWAYS-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFALWAYS-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFALWAYS-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFALWAYS-NEXT:    [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP10]])
 ; TFALWAYS-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
@@ -702,7 +702,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFFALLBACK-NEXT:    [[TMP7:%.*]] = fptoui <vscale x 2 x double> [[WIDE_MASKED_LOAD]] to <vscale x 2 x i64>
 ; TFFALLBACK-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFFALLBACK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; TFFALLBACK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 4, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT:    call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> [[TMP8]], ptr [[TMP9]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
 ; TFFALLBACK-NEXT:    [[TMP10:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
 ; TFFALLBACK-NEXT:    [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], <vscale x 2 x double> [[TMP10]])
 ; TFFALLBACK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()

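In masked-call.ll the same i64 alignment shift appears as an operand change rather than an attribute change, because the masked load/store intrinsics carry the access alignment in bytes as an immediate i32 argument, which the vectorizer copies from the scalar access it widens. For reference, the signatures of the two intrinsics called above:

  declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>)
  declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, i32 immarg, <vscale x 2 x i1>)
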
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index 94c48de856ed701..04af618014a795c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -33,11 +33,11 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[TMP16:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD3]])
@@ -67,7 +67,7 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX7]], 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8
 ; CHECK-NEXT:    [[TMP26:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD9]])
 ; CHECK-NEXT:    [[TMP27]] = and i64 [[TMP26]], [[VEC_PHI8]]
 ; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
@@ -84,7 +84,7 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[RDX:%.*]] = phi i64 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ]
 ; CHECK-NEXT:    [[L2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[L3:%.*]] = load i64, ptr [[L2]], align 4
+; CHECK-NEXT:    [[L3:%.*]] = load i64, ptr [[L2]], align 8
 ; CHECK-NEXT:    [[AND]] = and i64 [[RDX]], [[L3]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]

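The <vscale x 2 x i64> and <2 x i64> accesses in this and the next few SVE epilogue-vectorization tests move to align 8 for the same underlying reason: the vectorizer stamps each widened load and store with the alignment of the scalar i64 access it replaces, which is now the 8-byte ABI alignment rather than the 4-byte default.
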
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index 9bbbf46e2384422..f84e7c0ea182ba9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -33,11 +33,11 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP12]], align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
 ; CHECK-NEXT:    [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
@@ -67,7 +67,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = add i64 [[INDEX7]], 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <2 x i64>, ptr [[TMP25]], align 8
 ; CHECK-NEXT:    [[TMP26]] = add <2 x i64> [[WIDE_LOAD9]], [[VEC_PHI8]]
 ; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
@@ -84,7 +84,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[ADD]] = add i64 [[TMP29]], [[SUM]]
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index 302e2ba7e9b31c6..b378603bbc52ff5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -55,11 +55,11 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK-NEXT:    [[TMP28:%.*]] = mul i64 [[TMP27]], 8
 ; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP28]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    store <vscale x 2 x i64> zeroinitializer, ptr [[TMP29]], align 4
+; CHECK-NEXT:    store <vscale x 2 x i64> zeroinitializer, ptr [[TMP29]], align 8
 ; CHECK-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP31:%.*]] = mul i64 [[TMP30]], 2
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i64 [[TMP31]]
-; CHECK-NEXT:    store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 4
+; CHECK-NEXT:    store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 8
 ; CHECK-NEXT:    [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP34:%.*]] = mul i64 [[TMP33]], 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]]
@@ -79,7 +79,7 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[IV_1:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[IV_2:%.*]] = phi ptr [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    store i64 0, ptr [[IV_2]], align 4
+; CHECK-NEXT:    store i64 0, ptr [[IV_2]], align 8
 ; CHECK-NEXT:    [[IV_2_NEXT]] = getelementptr inbounds ptr, ptr [[IV_2]], i64 1
 ; CHECK-NEXT:    [[IV_1_NEXT]] = getelementptr inbounds ptr, ptr [[IV_1]], i64 1
 ; CHECK-NEXT:    [[CMP_I_I_NOT_I:%.*]] = icmp eq ptr [[IV_2_NEXT]], [[END]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
index 5fc316d0efe3365..166d77148244173 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
@@ -61,17 +61,17 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[TMP22]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr i64, ptr [[TMP23]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP27]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP27]], align 8
 ; CHECK-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 2
 ; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i64, ptr [[TMP23]], i64 [[TMP29]]
-; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 2 x i64>, ptr [[TMP30]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD12:%.*]] = load <vscale x 2 x i64>, ptr [[TMP30]], align 8
 ; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr i64, ptr [[TMP25]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <vscale x 2 x i64>, ptr [[TMP31]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD13:%.*]] = load <vscale x 2 x i64>, ptr [[TMP31]], align 8
 ; CHECK-NEXT:    [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP33:%.*]] = mul i64 [[TMP32]], 2
 ; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i64, ptr [[TMP25]], i64 [[TMP33]]
-; CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 2 x i64>, ptr [[TMP34]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 2 x i64>, ptr [[TMP34]], align 8
 ; CHECK-NEXT:    [[TMP35:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[WIDE_LOAD13]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD12]], [[WIDE_LOAD14]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[TMP17]]
@@ -79,17 +79,17 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[TMP22]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = getelementptr i64, ptr [[TMP37]], i32 0
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP35]], ptr [[TMP41]], align 4
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP35]], ptr [[TMP41]], align 8
 ; CHECK-NEXT:    [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP42]], 2
 ; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i64, ptr [[TMP37]], i64 [[TMP43]]
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP36]], ptr [[TMP44]], align 4
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP36]], ptr [[TMP44]], align 8
 ; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr i64, ptr [[TMP39]], i32 0
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP35]], ptr [[TMP45]], align 4
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP35]], ptr [[TMP45]], align 8
 ; CHECK-NEXT:    [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP47:%.*]] = mul i64 [[TMP46]], 2
 ; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr i64, ptr [[TMP39]], i64 [[TMP47]]
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP36]], ptr [[TMP48]], align 4
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP36]], ptr [[TMP48]], align 8
 ; CHECK-NEXT:    [[TMP49:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP49]], 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]]
@@ -105,13 +105,13 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[IV]]
 ; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[IV]]
-; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 4
-; CHECK-NEXT:    [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 4
+; CHECK-NEXT:    [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8
+; CHECK-NEXT:    [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8
 ; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[L_1]], [[L_2]]
 ; CHECK-NEXT:    [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[IV]]
 ; CHECK-NEXT:    [[GEP_DST_2:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[IV]]
-; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP_DST_1]], align 4
-; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP_DST_2]], align 4
+; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP_DST_1]], align 8
+; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP_DST_2]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
index cbded740efe4f4d..eaabc263913a7db 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll
@@ -185,11 +185,11 @@ define void @test_v4_v4m(ptr noalias %a, ptr readonly %b) #3 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -201,10 +201,10 @@ define void @test_v4_v4m(ptr noalias %a, ptr readonly %b) #3 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -241,11 +241,11 @@ define void @test_v2_v4m(ptr noalias %a, ptr readonly %b) #3 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_mask(<4 x i64> [[WIDE_LOAD]], <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -257,10 +257,10 @@ define void @test_v2_v4m(ptr noalias %a, ptr readonly %b) #3 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -297,11 +297,11 @@ define void @test_v2_v4(ptr noalias %a, ptr readonly %b) #3 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @foo_vector_fixed4_nomask(<4 x i64> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -313,10 +313,10 @@ define void @test_v2_v4(ptr noalias %a, ptr readonly %b) #3 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    store i64 [[CALL]], ptr [[ARRAYIDX]], align 8
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]

diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
index 240882bab636dd7..60619f5de09372a 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll
@@ -8,21 +8,20 @@ define dso_local double @test(ptr %Arr) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP2]])
-; CHECK-NEXT:    [[TMP4]] = fadd fast <2 x double> [[TMP3]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[ARR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x float> [[WIDE_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast <2 x double> @__sind2(<2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP3]] = fadd fast <2 x double> [[TMP2]], [[VEC_PHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP4]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
-; CHECK-NEXT:    ret double [[TMP6]]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP3]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]])
+; CHECK-NEXT:    ret double [[TMP5]]
 ;
 entry:
   br label %for.cond

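The PowerPC rewrite above is the native-width effect again: with a datalayout inferred from the triple, the vector loop's index phi is created directly as i64, the zext feeding the float GEP disappears, and the unnamed %TMP values renumber to match.
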
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll
index 241fa9d8cd49b33..bf148917dca10dc 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses-zve32x.ll
@@ -10,14 +10,14 @@ define void @load_store_zve32x(ptr %p) {
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
 ; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 4
+; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
 ; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 99f03cdcf648c02..29ef7364b821286 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -77,22 +77,49 @@ exit:
 define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor2_i64(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP6]], i32 -1
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 4
+; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -150,7 +177,7 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-NEXT:    store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -175,7 +202,7 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-NEXT:    store i32 [[Y2]], ptr [[Q2]], align 4
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -212,27 +239,75 @@ exit:
 define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i64(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP13]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP16]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP18]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = mul i64 [[I]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 4
+; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
 ; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 4
+; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; CHECK-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
-; CHECK-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 4
+; CHECK-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -269,52 +344,125 @@ exit:
 define void @load_store_factor8(ptr %p) {
 ; CHECK-LABEL: @load_store_factor8(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[P:%.*]], <vscale x 2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x ptr> [[TMP11]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x ptr> [[TMP14]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP13]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP16]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP18]], <vscale x 2 x ptr> [[TMP17]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP19:%.*]] = add <vscale x 2 x i64> [[TMP16]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP19]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP20]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP21:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER3]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 4, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP21]], <vscale x 2 x ptr> [[TMP20]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP22:%.*]] = add <vscale x 2 x i64> [[TMP19]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP22]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP23]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER4]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 5, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP24]], <vscale x 2 x ptr> [[TMP23]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP25:%.*]] = add <vscale x 2 x i64> [[TMP22]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP25]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP26]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP27:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 6, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP27]], <vscale x 2 x ptr> [[TMP26]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP28:%.*]] = add <vscale x 2 x i64> [[TMP25]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP28]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP29]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP30:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER6]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 7, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP30]], <vscale x 2 x ptr> [[TMP29]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP31:%.*]] = add <vscale x 2 x i64> [[TMP28]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i64, ptr [[P]], <vscale x 2 x i64> [[TMP31]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP32]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x i64> poison)
+; CHECK-NEXT:    [[TMP33:%.*]] = add <vscale x 2 x i64> [[WIDE_MASKED_GATHER7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 8, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[TMP33]], <vscale x 2 x ptr> [[TMP32]], i32 8, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP35:%.*]] = mul i64 [[TMP34]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP35]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 4
+; CHECK-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 4
+; CHECK-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
 ; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 4
+; CHECK-NEXT:    [[X2:%.*]] = load i64, ptr [[Q2]], align 8
 ; CHECK-NEXT:    [[Y2:%.*]] = add i64 [[X2]], 3
-; CHECK-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 4
+; CHECK-NEXT:    store i64 [[Y2]], ptr [[Q2]], align 8
 ; CHECK-NEXT:    [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
 ; CHECK-NEXT:    [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT:    [[X3:%.*]] = load i64, ptr [[Q3]], align 4
+; CHECK-NEXT:    [[X3:%.*]] = load i64, ptr [[Q3]], align 8
 ; CHECK-NEXT:    [[Y3:%.*]] = add i64 [[X3]], 4
-; CHECK-NEXT:    store i64 [[Y3]], ptr [[Q3]], align 4
+; CHECK-NEXT:    store i64 [[Y3]], ptr [[Q3]], align 8
 ; CHECK-NEXT:    [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
 ; CHECK-NEXT:    [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; CHECK-NEXT:    [[X4:%.*]] = load i64, ptr [[Q4]], align 4
+; CHECK-NEXT:    [[X4:%.*]] = load i64, ptr [[Q4]], align 8
 ; CHECK-NEXT:    [[Y4:%.*]] = add i64 [[X4]], 5
-; CHECK-NEXT:    store i64 [[Y4]], ptr [[Q4]], align 4
+; CHECK-NEXT:    store i64 [[Y4]], ptr [[Q4]], align 8
 ; CHECK-NEXT:    [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
 ; CHECK-NEXT:    [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
-; CHECK-NEXT:    [[X5:%.*]] = load i64, ptr [[Q5]], align 4
+; CHECK-NEXT:    [[X5:%.*]] = load i64, ptr [[Q5]], align 8
 ; CHECK-NEXT:    [[Y5:%.*]] = add i64 [[X5]], 6
-; CHECK-NEXT:    store i64 [[Y5]], ptr [[Q5]], align 4
+; CHECK-NEXT:    store i64 [[Y5]], ptr [[Q5]], align 8
 ; CHECK-NEXT:    [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
 ; CHECK-NEXT:    [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
-; CHECK-NEXT:    [[X6:%.*]] = load i64, ptr [[Q6]], align 4
+; CHECK-NEXT:    [[X6:%.*]] = load i64, ptr [[Q6]], align 8
 ; CHECK-NEXT:    [[Y6:%.*]] = add i64 [[X6]], 7
-; CHECK-NEXT:    store i64 [[Y6]], ptr [[Q6]], align 4
+; CHECK-NEXT:    store i64 [[Y6]], ptr [[Q6]], align 8
 ; CHECK-NEXT:    [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1
 ; CHECK-NEXT:    [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]]
-; CHECK-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 4
+; CHECK-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; CHECK-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
-; CHECK-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 4
+; CHECK-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -410,7 +558,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -429,7 +577,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -461,21 +609,55 @@ exit:
 define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-LABEL: @combine_load_factor2_i64(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
+; CHECK-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 4
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 4
+; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[I]]
-; CHECK-NEXT:    store i64 [[RES]], ptr [[DST]], align 4
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; CHECK-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
 ; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
 ; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
index 9b3b90a7bc3b613..072cef66c00f434 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
@@ -21,9 +21,9 @@ define void @load_store(ptr %p) {
 ; LMUL1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 0
 ; LMUL1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP2]]
 ; LMUL1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; LMUL1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 4
+; LMUL1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 1 x i64>, ptr [[TMP4]], align 8
 ; LMUL1-NEXT:    [[TMP5:%.*]] = add <vscale x 1 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i64 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
-; LMUL1-NEXT:    store <vscale x 1 x i64> [[TMP5]], ptr [[TMP4]], align 4
+; LMUL1-NEXT:    store <vscale x 1 x i64> [[TMP5]], ptr [[TMP4]], align 8
 ; LMUL1-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; LMUL1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; LMUL1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -37,9 +37,9 @@ define void @load_store(ptr %p) {
 ; LMUL1:       for.body:
 ; LMUL1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; LMUL1-NEXT:    [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
-; LMUL1-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 4
+; LMUL1-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 8
 ; LMUL1-NEXT:    [[W:%.*]] = add i64 [[V]], 1
-; LMUL1-NEXT:    store i64 [[W]], ptr [[Q]], align 4
+; LMUL1-NEXT:    store i64 [[W]], ptr [[Q]], align 8
 ; LMUL1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; LMUL1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; LMUL1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -63,9 +63,9 @@ define void @load_store(ptr %p) {
 ; LMUL2-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; LMUL2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
 ; LMUL2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; LMUL2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 4
+; LMUL2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
 ; LMUL2-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; LMUL2-NEXT:    store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 4
+; LMUL2-NEXT:    store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 8
 ; LMUL2-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; LMUL2-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; LMUL2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -80,9 +80,9 @@ define void @load_store(ptr %p) {
 ; LMUL2:       for.body:
 ; LMUL2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; LMUL2-NEXT:    [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
-; LMUL2-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 4
+; LMUL2-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 8
 ; LMUL2-NEXT:    [[W:%.*]] = add i64 [[V]], 1
-; LMUL2-NEXT:    store i64 [[W]], ptr [[Q]], align 4
+; LMUL2-NEXT:    store i64 [[W]], ptr [[Q]], align 8
 ; LMUL2-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; LMUL2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; LMUL2-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -106,9 +106,9 @@ define void @load_store(ptr %p) {
 ; LMUL4-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; LMUL4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
 ; LMUL4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; LMUL4-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP6]], align 4
+; LMUL4-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i64>, ptr [[TMP6]], align 8
 ; LMUL4-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; LMUL4-NEXT:    store <vscale x 4 x i64> [[TMP7]], ptr [[TMP6]], align 4
+; LMUL4-NEXT:    store <vscale x 4 x i64> [[TMP7]], ptr [[TMP6]], align 8
 ; LMUL4-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; LMUL4-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; LMUL4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -123,9 +123,9 @@ define void @load_store(ptr %p) {
 ; LMUL4:       for.body:
 ; LMUL4-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; LMUL4-NEXT:    [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
-; LMUL4-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 4
+; LMUL4-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 8
 ; LMUL4-NEXT:    [[W:%.*]] = add i64 [[V]], 1
-; LMUL4-NEXT:    store i64 [[W]], ptr [[Q]], align 4
+; LMUL4-NEXT:    store i64 [[W]], ptr [[Q]], align 8
 ; LMUL4-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; LMUL4-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; LMUL4-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -149,9 +149,9 @@ define void @load_store(ptr %p) {
 ; LMUL8-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; LMUL8-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
 ; LMUL8-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; LMUL8-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP6]], align 4
+; LMUL8-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, ptr [[TMP6]], align 8
 ; LMUL8-NEXT:    [[TMP7:%.*]] = add <vscale x 8 x i64> [[WIDE_LOAD]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
-; LMUL8-NEXT:    store <vscale x 8 x i64> [[TMP7]], ptr [[TMP6]], align 4
+; LMUL8-NEXT:    store <vscale x 8 x i64> [[TMP7]], ptr [[TMP6]], align 8
 ; LMUL8-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; LMUL8-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 8
 ; LMUL8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
@@ -166,9 +166,9 @@ define void @load_store(ptr %p) {
 ; LMUL8:       for.body:
 ; LMUL8-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; LMUL8-NEXT:    [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
-; LMUL8-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 4
+; LMUL8-NEXT:    [[V:%.*]] = load i64, ptr [[Q]], align 8
 ; LMUL8-NEXT:    [[W:%.*]] = add i64 [[V]], 1
-; LMUL8-NEXT:    store i64 [[W]], ptr [[Q]], align 4
+; LMUL8-NEXT:    store i64 [[W]], ptr [[Q]], align 8
 ; LMUL8-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; LMUL8-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
 ; LMUL8-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
index fab924580cbb70d..72e3bca79a5a537 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll
@@ -22,9 +22,9 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea
 ; RV32-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 625, [[TMP2]]
 ; RV32-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; RV32:       vector.memcheck:
-; RV32-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 79880
-; RV32-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i64 39940
-; RV32-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 159752
+; RV32-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880
+; RV32-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940
+; RV32-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752
 ; RV32-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
 ; RV32-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]]
 ; RV32-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]

diff --git a/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll b/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll
index 04cbc3ee8cf3416..8d48a20f4ce5bbf 100644
--- a/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll
+++ b/llvm/test/Transforms/MemCpyOpt/no-libcalls.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=memcpyopt < %s | FileCheck %s --check-prefixes=CHECK,LIBCALLS
+; RUN: opt -S -passes=memcpyopt -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,LIBCALLS
 ; RUN: opt -S -passes=memcpyopt -mtriple=amdgcn-- < %s | FileCheck %s --check-prefixes=CHECK,NO-LIBCALLS
 ; RUN: opt -S -passes=memcpyopt -mtriple=amdgcn-- -enable-memcpyopt-without-libcalls < %s \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK,LIBCALLS
@@ -38,12 +38,12 @@ define void @dont_create_memset(ptr %p) {
 
 define void @dont_create_memcpy(ptr %p1, ptr %p2) {
 ; LIBCALLS-LABEL: @dont_create_memcpy(
-; LIBCALLS-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 4 [[P2:%.*]], ptr align 4 [[P1:%.*]], i64 8, i1 false)
+; LIBCALLS-NEXT:    call void @llvm.memmove.p0.p0.i64(ptr align 8 [[P2:%.*]], ptr align 8 [[P1:%.*]], i64 8, i1 false)
 ; LIBCALLS-NEXT:    ret void
 ;
 ; NO-LIBCALLS-LABEL: @dont_create_memcpy(
-; NO-LIBCALLS-NEXT:    [[V:%.*]] = load [[TY:%.*]], ptr [[P1:%.*]], align 4
-; NO-LIBCALLS-NEXT:    store [[TY]] [[V]], ptr [[P2:%.*]], align 4
+; NO-LIBCALLS-NEXT:    [[V:%.*]] = load [[TY:%.*]], ptr [[P1:%.*]], align 8
+; NO-LIBCALLS-NEXT:    store [[TY]] [[V]], ptr [[P2:%.*]], align 8
 ; NO-LIBCALLS-NEXT:    ret void
 ;
   %v = load %ty, ptr %p1

diff --git a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll
index fc85f57e020c103..f3541630c804fdb 100644
--- a/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll
+++ b/llvm/test/Transforms/MergeICmps/X86/addressspaces.ll
@@ -46,13 +46,13 @@ define void @no_memcmp(ptr addrspace(11) dereferenceable(16) %a, ptr addrspace(1
 ; CHECK-NEXT:    [[PTR_B1:%.*]] = getelementptr inbounds [2 x i64], ptr addrspace(11) [[B:%.*]], i64 0, i64 1
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[A0:%.*]] = load i64, ptr addrspace(11) [[A]], align 4
-; CHECK-NEXT:    [[B0:%.*]] = load i64, ptr addrspace(11) [[B]], align 4
+; CHECK-NEXT:    [[A0:%.*]] = load i64, ptr addrspace(11) [[A]], align 8
+; CHECK-NEXT:    [[B0:%.*]] = load i64, ptr addrspace(11) [[B]], align 8
 ; CHECK-NEXT:    [[COND0:%.*]] = icmp eq i64 [[A0]], [[B0]]
 ; CHECK-NEXT:    br i1 [[COND0]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[A1:%.*]] = load i64, ptr addrspace(11) [[PTR_A1]], align 4
-; CHECK-NEXT:    [[B1:%.*]] = load i64, ptr addrspace(11) [[PTR_B1]], align 4
+; CHECK-NEXT:    [[A1:%.*]] = load i64, ptr addrspace(11) [[PTR_A1]], align 8
+; CHECK-NEXT:    [[B1:%.*]] = load i64, ptr addrspace(11) [[PTR_B1]], align 8
 ; CHECK-NEXT:    [[COND1:%.*]] = icmp eq i64 [[A1]], [[B1]]
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:

diff --git a/llvm/test/Transforms/NewGVN/refine-stores.ll b/llvm/test/Transforms/NewGVN/refine-stores.ll
index a45dd1f48e1c095..eb5acd276a597b0 100644
--- a/llvm/test/Transforms/NewGVN/refine-stores.ll
+++ b/llvm/test/Transforms/NewGVN/refine-stores.ll
@@ -56,7 +56,7 @@ define void @a() {
 ; CHECK-NEXT:  b:
 ; CHECK-NEXT:    br label [[C:%.*]]
 ; CHECK:       c:
-; CHECK-NEXT:    store i64 undef, ptr null, align 4
+; CHECK-NEXT:    store i64 undef, ptr null, align 8
 ; CHECK-NEXT:    br label [[E:%.*]]
 ; CHECK:       e:
 ; CHECK-NEXT:    store ptr undef, ptr null, align 8
@@ -93,9 +93,9 @@ define void @widget(ptr %arg) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 0
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[BB7]], label [[BB5:%.*]]
 ; CHECK:       bb5:
-; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr null, align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr null, align 8
 ; CHECK-NEXT:    call void @quux()
-; CHECK-NEXT:    store i64 [[TMP6]], ptr undef, align 4
+; CHECK-NEXT:    store i64 [[TMP6]], ptr undef, align 8
 ; CHECK-NEXT:    br label [[BB7]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    [[TMP8]] = add i64 [[TMP3]], 1

diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index 8aca820afdfa883..3040c1721d36347 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -329,7 +329,7 @@ define void @neg_loads() "kernel" {
 define void @pos_priv_mem() "kernel" {
 ; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem
 ; CHECK-SAME: () #[[ATTR4]] {
-; CHECK-NEXT:    [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 8
+; CHECK-NEXT:    [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 4
 ; CHECK-NEXT:    [[LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[A:%.*]] = load i32, ptr @PG1, align 4
 ; CHECK-NEXT:    store i32 [[A]], ptr [[LOC]], align 4

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
index dd6d2e8270d9c54..40a5c73b69de9ca 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
@@ -41,49 +41,47 @@ return:                                           ; preds = %if.end3, %if.then2,
 define void @loop(ptr %X, ptr %Y) {
 ; CHECK-LABEL: @loop(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[X5:%.*]] = ptrtoint ptr [[X:%.*]] to i64
-; CHECK-NEXT:    [[Y6:%.*]] = ptrtoint ptr [[Y:%.*]] to i64
-; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[X5]], [[Y6]]
+; CHECK-NEXT:    [[X6:%.*]] = ptrtoint ptr [[X:%.*]] to i64
+; CHECK-NEXT:    [[Y7:%.*]] = ptrtoint ptr [[Y:%.*]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[X6]], [[Y7]]
 ; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
 ; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[INDEX]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[TMP1]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 2
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD7]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], <double 6.000000e+00, double 6.000000e+00>
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD7]], <double 6.000000e+00, double 6.000000e+00>
-; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[TMP7]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 2
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD8]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], <double 6.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[TMP6:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD8]], <double 6.000000e+00, double 6.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[TMP5]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x double> zeroinitializer, <2 x double> [[TMP8]]
-; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[TMP5]], <2 x double> zeroinitializer, <2 x double> [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
+; CHECK-NEXT:    store <2 x double> [[TMP9]], ptr [[TMP11]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i64 2
 ; CHECK-NEXT:    store <2 x double> [[TMP10]], ptr [[TMP12]], align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP12]], i64 2
-; CHECK-NEXT:    store <2 x double> [[TMP11]], ptr [[TMP13]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20000
-; CHECK-NEXT:    br i1 [[TMP14]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000
+; CHECK-NEXT:    br i1 [[TMP13]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_04]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp olt double [[TMP15]], 0.000000e+00
-; CHECK-NEXT:    [[CMP1_I:%.*]] = fcmp ogt double [[TMP15]], 6.000000e+00
-; CHECK-NEXT:    [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP15]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[CMP_I:%.*]] = fcmp olt double [[TMP14]], 0.000000e+00
+; CHECK-NEXT:    [[CMP1_I:%.*]] = fcmp ogt double [[TMP14]], 6.000000e+00
+; CHECK-NEXT:    [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP14]]
 ; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store double [[RETVAL_0_I]], ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I_04]], 19999
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ;
 entry:
   %X.addr = alloca ptr, align 8
@@ -193,8 +191,8 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) {
 ; CHECK-NEXT:    [[ADD_SINK:%.*]] = phi float [ [[ADD]], [[ELSE]] ], [ [[MUL2_I81_I]], [[LOOP_BODY]] ]
 ; CHECK-NEXT:    store float [[ADD_SINK]], ptr [[B_GEP_0]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
-; CHECK-NEXT:    [[CMP_0:%.*]] = icmp ult i64 [[IV1]], 9999
-; CHECK-NEXT:    br i1 [[CMP_0]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
index 2db35c4e03e4dcc..d048b0bab417676 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -92,21 +92,20 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, ptr [[B:%.*]], i64 0, i64 [[CONV6]]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
 ; CHECK:       for.body4.us:
-; CHECK-NEXT:    [[K_011_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ]
-; CHECK-NEXT:    [[CONV_US:%.*]] = zext i32 [[K_011_US]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[K_011_US]], 225
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 225
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[CONV_US]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[MATRIXEXT_US:%.*]] = load double, ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[MATRIXEXT8_US:%.*]] = load double, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[CONV_US]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[MATRIXEXT11_US:%.*]] = load double, ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]]
 ; CHECK-NEXT:    store double [[SUB_US]], ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[INC_US]] = add nuw nsw i32 [[K_011_US]], 1
-; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]]
-; CHECK-NEXT:    br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[CONV6]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]], label [[FOR_BODY4_US]]
 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[CONV6]], 15
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[I]], 210
@@ -114,10 +113,9 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP5]]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US_1:%.*]]
 ; CHECK:       for.body4.us.1:
-; CHECK-NEXT:    [[K_011_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ]
-; CHECK-NEXT:    [[CONV_US_1:%.*]] = zext i32 [[K_011_US_1]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[CONV_US_1]], 15
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[K_011_US_1]], 210
+; CHECK-NEXT:    [[INDVARS_IV_1:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY4_US_1]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV_1]], 15
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[INDVARS_IV_1]], 210
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP9]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP8]]
 ; CHECK-NEXT:    [[MATRIXEXT_US_1:%.*]] = load double, ptr [[TMP10]], align 8
@@ -127,9 +125,9 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[MATRIXEXT11_US_1:%.*]] = load double, ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]]
 ; CHECK-NEXT:    store double [[SUB_US_1]], ptr [[TMP11]], align 8
-; CHECK-NEXT:    [[INC_US_1]] = add nuw nsw i32 [[K_011_US_1]], 1
-; CHECK-NEXT:    [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]]
-; CHECK-NEXT:    br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV_1]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], [[CONV6]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]], label [[FOR_BODY4_US_1]]
 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.1:
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[CONV6]], 30
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i32 [[I]], 195
@@ -137,10 +135,9 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP12]]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US_2:%.*]]
 ; CHECK:       for.body4.us.2:
-; CHECK-NEXT:    [[K_011_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ]
-; CHECK-NEXT:    [[CONV_US_2:%.*]] = zext i32 [[K_011_US_2]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[CONV_US_2]], 30
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ult i32 [[K_011_US_2]], 195
+; CHECK-NEXT:    [[INDVARS_IV_2:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INDVARS_IV_NEXT_2:%.*]], [[FOR_BODY4_US_2]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV_2]], 30
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ult i64 [[INDVARS_IV_2]], 195
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP16]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP15]]
 ; CHECK-NEXT:    [[MATRIXEXT_US_2:%.*]] = load double, ptr [[TMP17]], align 8
@@ -150,9 +147,9 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[MATRIXEXT11_US_2:%.*]] = load double, ptr [[TMP18]], align 8
 ; CHECK-NEXT:    [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]]
 ; CHECK-NEXT:    store double [[SUB_US_2]], ptr [[TMP18]], align 8
-; CHECK-NEXT:    [[INC_US_2]] = add nuw nsw i32 [[K_011_US_2]], 1
-; CHECK-NEXT:    [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]]
-; CHECK-NEXT:    br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_2]] = add nuw nsw i64 [[INDVARS_IV_2]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], [[CONV6]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]], label [[FOR_BODY4_US_2]]
 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.2:
 ; CHECK-NEXT:    [[TMP19:%.*]] = add nuw nsw i64 [[CONV6]], 45
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp ult i32 [[I]], 180
@@ -160,10 +157,9 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP19]]
 ; CHECK-NEXT:    br label [[FOR_BODY4_US_3:%.*]]
 ; CHECK:       for.body4.us.3:
-; CHECK-NEXT:    [[K_011_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ]
-; CHECK-NEXT:    [[CONV_US_3:%.*]] = zext i32 [[K_011_US_3]] to i64
-; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[CONV_US_3]], 45
-; CHECK-NEXT:    [[TMP23:%.*]] = icmp ult i32 [[K_011_US_3]], 180
+; CHECK-NEXT:    [[INDVARS_IV_3:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY4_US_3]] ]
+; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV_3]], 45
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ult i64 [[INDVARS_IV_3]], 180
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP23]])
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP22]]
 ; CHECK-NEXT:    [[MATRIXEXT_US_3:%.*]] = load double, ptr [[TMP24]], align 8
@@ -173,9 +169,9 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea
 ; CHECK-NEXT:    [[MATRIXEXT11_US_3:%.*]] = load double, ptr [[TMP25]], align 8
 ; CHECK-NEXT:    [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]]
 ; CHECK-NEXT:    store double [[SUB_US_3]], ptr [[TMP25]], align 8
-; CHECK-NEXT:    [[INC_US_3]] = add nuw nsw i32 [[K_011_US_3]], 1
-; CHECK-NEXT:    [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]]
-; CHECK-NEXT:    br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV_3]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], [[CONV6]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_3]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY4_US_3]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    ret void
 ;

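The i32-to-i64 phi rewrites above (and the disappearing zexts) come from IndVarSimplify's induction-variable widening, which consults the DataLayout and only widens to i64 when the layout's "n" spec declares it a native integer width; with the AArch64 layout now inferred from the triple, the widening fires where it previously could not. A reduced, hypothetical test (not part of this commit) sketching the effect:

; RUN: opt -S -passes=indvars -mtriple=aarch64-unknown-linux-gnu %s | FileCheck %s
define void @widen(ptr %a) {
entry:
  br label %loop
loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %idx = zext i32 %iv to i64            ; this extension is what makes widening profitable
  %gep = getelementptr i64, ptr %a, i64 %idx
  store i64 0, ptr %gep
  %iv.next = add nuw nsw i32 %iv, 1
  %cmp = icmp ult i32 %iv.next, 1000
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}
; CHECK-LABEL: @widen(
; CHECK: phi i64
; CHECK-NOT: zext

Without the -mtriple (and hence with no DataLayout), the loop should keep its i32 induction variable, matching the old checks.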
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
index 8d41d9a486801f3..a077239e5ffc7a3 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll
@@ -16,19 +16,20 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) {
 ; CHECK-NEXT:    [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64
 ; CHECK-NEXT:    [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64
 ; CHECK-NEXT:    [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]]
+; CHECK-NEXT:    [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0)
 ; CHECK-NEXT:    [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1
 ; CHECK-NEXT:    [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8
 ; CHECK-NEXT:    [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8
 ; CHECK-NEXT:    [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64
 ; CHECK-NEXT:    [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL]] to i64
 ; CHECK-NEXT:    [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]]
-; CHECK-NEXT:    [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 4
-; CHECK-NEXT:    [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 4
+; CHECK-NEXT:    [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8
+; CHECK-NEXT:    [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8
 ; CHECK-NEXT:    [[SUM_NEXT_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]]
-; CHECK-NEXT:    [[C_PEEL:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[C_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]]
 ; CHECK:       loop.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
 ; CHECK-NEXT:    [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I6_PEEL]], i64 [[TMP0]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[UMIN]]
 ; CHECK-NEXT:    [[UMIN15:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]])
@@ -49,13 +50,13 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) {
 ; CHECK-NEXT:    [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2
-; CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2
-; CHECK-NEXT:    [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[WIDE_LOAD17]], [[VEC_PHI16]]
 ; CHECK-NEXT:    [[TMP12]] = add <2 x i64> [[TMP10]], [[WIDE_LOAD18]]
@@ -87,14 +88,14 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) {
 ; CHECK-NEXT:    unreachable
 ; CHECK:       at_with_int_conversion.exit11:
 ; CHECK-NEXT:    [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]]
-; CHECK-NEXT:    [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 4
+; CHECK-NEXT:    [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8
 ; CHECK-NEXT:    [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]]
-; CHECK-NEXT:    [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 4
+; CHECK-NEXT:    [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8
 ; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[LV_I]], [[SUM]]
 ; CHECK-NEXT:    [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I9]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[C]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT11_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT11]] ]
 ; CHECK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
@@ -127,8 +128,9 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) {
 ; CHECK-NEXT:    [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64
 ; CHECK-NEXT:    [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]]
 ; CHECK-NEXT:    [[GEP_END_I13:%.*]] = getelementptr [[VEC]], ptr [[C:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0)
 ; CHECK-NEXT:    [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 4
+; CHECK-NEXT:    [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8
 ; CHECK-NEXT:    [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8
 ; CHECK-NEXT:    [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8
 ; CHECK-NEXT:    [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64
@@ -140,14 +142,14 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) {
 ; CHECK-NEXT:    [[START_INT_I15_PEEL:%.*]] = ptrtoint ptr [[START_I12_PEEL]] to i64
 ; CHECK-NEXT:    [[END_INT_I16_PEEL:%.*]] = ptrtoint ptr [[END_I14_PEEL]] to i64
 ; CHECK-NEXT:    [[SUB_I17_PEEL:%.*]] = sub i64 [[END_INT_I16_PEEL]], [[START_INT_I15_PEEL]]
-; CHECK-NEXT:    [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 4
-; CHECK-NEXT:    [[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 4
+; CHECK-NEXT:    [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8
+; CHECK-NEXT:    [[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 8
 ; CHECK-NEXT:    [[ADD_2_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]]
 ; CHECK-NEXT:    [[SUM_NEXT_PEEL:%.*]] = add i64 [[ADD_2_PEEL]], [[LV_I20_PEEL]]
-; CHECK-NEXT:    [[COND_PEEL:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT:    br i1 [[COND_PEEL]], label [[LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK-NEXT:    [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]]
 ; CHECK:       loop.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
 ; CHECK-NEXT:    [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I17_PEEL]], i64 [[TMP0]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[UMIN]]
 ; CHECK-NEXT:    [[UMIN26:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I6_PEEL]])
@@ -169,17 +171,17 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) {
 ; CHECK-NEXT:    [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2
-; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2
-; CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i64 2
-; CHECK-NEXT:    [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD29]], [[VEC_PHI28]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD30]]
@@ -207,7 +209,7 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) {
 ; CHECK-NEXT:    unreachable
 ; CHECK:       at_with_int_conversion.exit:
 ; CHECK-NEXT:    [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]]
-; CHECK-NEXT:    [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 4
+; CHECK-NEXT:    [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8
 ; CHECK-NEXT:    [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]]
 ; CHECK-NEXT:    br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11:%.*]]
 ; CHECK:       error.i10:
@@ -221,15 +223,15 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) {
 ; CHECK-NEXT:    unreachable
 ; CHECK:       at_with_int_conversion.exit22:
 ; CHECK-NEXT:    [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]]
-; CHECK-NEXT:    [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 4
+; CHECK-NEXT:    [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8
 ; CHECK-NEXT:    [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[IV]]
-; CHECK-NEXT:    [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 4
+; CHECK-NEXT:    [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 8
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add i64 [[LV_I]], [[SUM]]
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add i64 [[ADD_1]], [[LV_I9]]
 ; CHECK-NEXT:    [[SUM_NEXT]] = add i64 [[ADD_2]], [[LV_I20]]
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT22_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT22]] ]
 ; CHECK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
@@ -267,7 +269,7 @@ define i64 @at_with_int_conversion(ptr %ptr, i64 %idx) {
 ; CHECK-NEXT:    br i1 [[INRANGE]], label [[ERROR:%.*]], label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[GEP_IDX:%.*]] = getelementptr i64, ptr [[START]], i64 [[IDX]]
-; CHECK-NEXT:    [[LV:%.*]] = load i64, ptr [[GEP_IDX]], align 4
+; CHECK-NEXT:    [[LV:%.*]] = load i64, ptr [[GEP_IDX]], align 8
 ; CHECK-NEXT:    ret i64 [[LV]]
 ; CHECK:       error:
 ; CHECK-NEXT:    tail call void @error()

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
index 47b98d1a6f73303..d67fdc1cd6aa0e3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-add-i64.ll
@@ -183,9 +183,9 @@ entry:
 define i64 @red_ld_2xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_ld_2xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[LD0:%.*]] = load i64, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 1
-; CHECK-NEXT:    [[LD1:%.*]] = load i64, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[LD1:%.*]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i64 [[LD0]], [[LD1]]
 ; CHECK-NEXT:    ret i64 [[ADD_1]]
 ;
@@ -200,7 +200,7 @@ entry:
 define i64 @red_ld_4xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_ld_4xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]])
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
@@ -221,7 +221,7 @@ entry:
 define i64 @red_ld_8xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_ld_8xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP0]])
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
@@ -254,7 +254,7 @@ entry:
 define i64 @red_ld_16xi64(ptr %ptr) {
 ; CHECK-LABEL: @red_ld_16xi64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i64>, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP0]])
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;

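The wholesale align 4 to align 8 updates in this and the surrounding AArch64/X86 tests are a direct consequence of the inferred layout: LLVM's built-in DataLayout defaults give i64 an ABI alignment of only 4 bytes, and a load or store written without an explicit `align` is assigned the ABI alignment of its type when the IR is constructed. Once opt infers a layout with i64:64 from the triple, the same unannotated accesses come out as align 8. A minimal, hypothetical round-trip (using -passes=verify purely as a no-op) illustrating this:

; RUN: opt -S -passes=verify -mtriple=aarch64-unknown-linux-gnu %s | FileCheck %s
define i64 @implicit_align(ptr %p) {
  %v = load i64, ptr %p       ; no explicit align: the ABI alignment is filled in at parse time
  ret i64 %v
}
; CHECK: load i64, ptr %p, align 8

Dropping the -mtriple should restore the old behaviour and print the load with align 4.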
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
index 111ca8abf8122fb..afaf6b98e50812e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
@@ -101,7 +101,7 @@ define void @splat_loads_i64(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[ARRAY1]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[ARRAY1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
index d1dd57fac495879..69cfdd44ff6e7ba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/control-dependence.ll
@@ -8,10 +8,10 @@ declare i64 @may_throw() willreturn
 ; Base case with no interesting control dependencies
 define void @test_no_control(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test_no_control(
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP3]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -32,13 +32,13 @@ define void @test_no_control(ptr %a, ptr %b, ptr %c) {
 
 define void @test1(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -58,13 +58,13 @@ define void @test1(ptr %a, ptr %b, ptr %c) {
 
 define void @test2(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %c1 = load i64, ptr %c
@@ -85,13 +85,13 @@ define void @test2(ptr %a, ptr %b, ptr %c) {
 
 define void @test3(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -111,13 +111,13 @@ define void @test3(ptr %a, ptr %b, ptr %c) {
 
 define void @test4(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -138,12 +138,12 @@ define void @test4(ptr %a, ptr %b, ptr %c) {
 define void @test5(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:    [[C2:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = load i64, ptr [[C:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[C1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[C2]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a2 = getelementptr i64, ptr %a, i32 1
@@ -164,10 +164,10 @@ define void @test5(ptr %a, ptr %b, ptr %c) {
 define void @test6(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -196,15 +196,15 @@ define void @test6(ptr %a, ptr %b, ptr %c) {
 define void @test7(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A]], align 4
-; CHECK-NEXT:    store i64 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A]], align 8
+; CHECK-NEXT:    store i64 0, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -229,15 +229,15 @@ define void @test7(ptr %a, ptr %b, ptr %c) {
 define void @test8(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test8(
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A]], align 4
-; CHECK-NEXT:    store i64 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A]], align 8
+; CHECK-NEXT:    store i64 0, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_throw() #[[ATTR4:[0-9]+]]
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -262,15 +262,15 @@ define void @test8(ptr %a, ptr %b, ptr %c) {
 define void @test9(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[A:%.*]], i32 1
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A]], align 4
-; CHECK-NEXT:    store i64 0, ptr [[A]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A]], align 8
+; CHECK-NEXT:    store i64 0, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_throw()
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[V2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -294,18 +294,18 @@ define void @test9(ptr %a, ptr %b, ptr %c) {
 ; A variant of test7 which shows the same problem with a non-load instruction
 define void @test10(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr i64, ptr [[A]], i32 1
-; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 8
 ; CHECK-NEXT:    [[U1:%.*]] = udiv i64 200, [[V1]]
-; CHECK-NEXT:    store i64 [[U1]], ptr [[A]], align 4
+; CHECK-NEXT:    store i64 [[U1]], ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
 ; CHECK-NEXT:    [[U2:%.*]] = udiv i64 200, [[V2]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %v1 = load i64, ptr %a
@@ -334,14 +334,14 @@ define void @test10(ptr %a, ptr %b, ptr %c) {
 define void @test11(i64 %x, i64 %y, ptr %b, ptr %c) {
 ; CHECK-LABEL: @test11(
 ; CHECK-NEXT:    [[U1:%.*]] = udiv i64 200, [[X:%.*]]
-; CHECK-NEXT:    store i64 [[U1]], ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i64 [[U1]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
 ; CHECK-NEXT:    [[U2:%.*]] = udiv i64 200, [[Y:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[C:%.*]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[U1]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[U2]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B]], align 4
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[B]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %u1 = udiv i64 200, %x

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
index e2adeb911670a56..a2c2e41d81b4264 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll
@@ -52,14 +52,17 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 {
 
 define void @test2(i64* %a, i64* %b) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> <i64 1, i64 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2
+; CHECK-NEXT:    [[I1:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT:    [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3
+; CHECK-NEXT:    [[I2:%.*]] = ptrtoint ptr [[B3]] to i64
+; CHECK-NEXT:    [[V1:%.*]] = load i64, ptr [[A1]], align 8
+; CHECK-NEXT:    [[V2:%.*]] = load i64, ptr [[A2]], align 8
+; CHECK-NEXT:    [[ADD1:%.*]] = add i64 [[I1]], [[V1]]
+; CHECK-NEXT:    [[ADD2:%.*]] = add i64 [[I2]], [[V2]]
+; CHECK-NEXT:    store i64 [[ADD1]], ptr [[A1]], align 8
+; CHECK-NEXT:    store i64 [[ADD2]], ptr [[A2]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a1 = getelementptr inbounds i64, i64* %a, i64 1

diff --git a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll
index 7fbd4506bea78c2..d5bd7c67109a161 100644
--- a/llvm/test/Transforms/SafeStack/X86/setjmp2.ll
+++ b/llvm/test/Transforms/SafeStack/X86/setjmp2.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s
-; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s
+; RUN: opt -safe-stack -S -mtriple=i386-pc-linux-gnu < %s -o - | FileCheck %s --check-prefix=I386
+; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck %s --check-prefix=X86-64
 
 %struct.__jmp_buf_tag = type { [8 x i64], i32, %struct.__sigset_t }
 %struct.__sigset_t = type { [16 x i64] }
@@ -11,27 +11,48 @@
 ; setjmp/longjmp test with dynamically sized array.
 ; Requires protector.
 define i32 @foo(i32 %size) nounwind uwtable safestack {
-; CHECK-LABEL: define i32 @foo(
-; CHECK-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8
-; CHECK-NEXT:    [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[SIZE]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], -16
-; CHECK-NEXT:    [[A:%.*]] = inttoptr i64 [[TMP5]] to ptr
-; CHECK-NEXT:    store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 8
-; CHECK-NEXT:    store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
-; CHECK-NEXT:    [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
-; CHECK-NEXT:    store ptr [[TMP6]], ptr @__safestack_unsafe_stack_ptr, align 8
-; CHECK-NEXT:    call void @funcall(ptr [[A]])
-; CHECK-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 8
-; CHECK-NEXT:    ret i32 0
+; I386-LABEL: define i32 @foo(
+; I386-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] {
+; I386-NEXT:  entry:
+; I386-NEXT:    [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 4
+; I386-NEXT:    [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 4
+; I386-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4
+; I386-NEXT:    [[TMP0:%.*]] = mul i32 [[SIZE]], 4
+; I386-NEXT:    [[TMP1:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 4
+; I386-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i32
+; I386-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], [[TMP0]]
+; I386-NEXT:    [[TMP4:%.*]] = and i32 [[TMP3]], -16
+; I386-NEXT:    [[A:%.*]] = inttoptr i32 [[TMP4]] to ptr
+; I386-NEXT:    store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 4
+; I386-NEXT:    store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4
+; I386-NEXT:    [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]]
+; I386-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 4
+; I386-NEXT:    store ptr [[TMP5]], ptr @__safestack_unsafe_stack_ptr, align 4
+; I386-NEXT:    call void @funcall(ptr [[A]])
+; I386-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 4
+; I386-NEXT:    ret i32 0
+;
+; X86-64-LABEL: define i32 @foo(
+; X86-64-SAME: i32 [[SIZE:%.*]]) #[[ATTR0:[0-9]+]] {
+; X86-64-NEXT:  entry:
+; X86-64-NEXT:    [[UNSAFE_STACK_PTR:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8
+; X86-64-NEXT:    [[UNSAFE_STACK_DYNAMIC_PTR:%.*]] = alloca ptr, align 8
+; X86-64-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
+; X86-64-NEXT:    [[TMP0:%.*]] = zext i32 [[SIZE]] to i64
+; X86-64-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; X86-64-NEXT:    [[TMP2:%.*]] = load ptr, ptr @__safestack_unsafe_stack_ptr, align 8
+; X86-64-NEXT:    [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64
+; X86-64-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP1]]
+; X86-64-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], -16
+; X86-64-NEXT:    [[A:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; X86-64-NEXT:    store ptr [[A]], ptr @__safestack_unsafe_stack_ptr, align 8
+; X86-64-NEXT:    store ptr [[A]], ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
+; X86-64-NEXT:    [[CALL:%.*]] = call i32 @_setjmp(ptr @buf) #[[ATTR1:[0-9]+]]
+; X86-64-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[UNSAFE_STACK_DYNAMIC_PTR]], align 8
+; X86-64-NEXT:    store ptr [[TMP6]], ptr @__safestack_unsafe_stack_ptr, align 8
+; X86-64-NEXT:    call void @funcall(ptr [[A]])
+; X86-64-NEXT:    store ptr [[UNSAFE_STACK_PTR]], ptr @__safestack_unsafe_stack_ptr, align 8
+; X86-64-NEXT:    ret i32 0
 ;
 entry:
 

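Splitting the checks into I386 and X86-64 prefixes is necessary because the two triples no longer parse to identical IR: with the layout inferred, i386 has 32-bit pointers and x86-64 has 64-bit ones, so SafeStack's unsafe-stack-pointer arithmetic is emitted in i32 on one and i64 on the other. The divergence shows up even without running SafeStack; a hypothetical sketch (again treating -passes=verify as a no-op):

; RUN: opt -S -passes=verify -mtriple=i386-pc-linux-gnu %s | FileCheck %s --check-prefix=I386
; RUN: opt -S -passes=verify -mtriple=x86_64-pc-linux-gnu %s | FileCheck %s --check-prefix=X86-64
define void @slot(ptr %p) {
  %slot = alloca ptr          ; the slot's alignment tracks the target's pointer width
  store ptr %p, ptr %slot
  ret void
}
; I386: alloca ptr, align 4
; X86-64: alloca ptr, align 8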
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll
index 237c2b42b19019c..6ef8a38dfd45957 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll
@@ -22,20 +22,20 @@ define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) {
 ; IR-NEXT:  .preheader:
 ; IR-NEXT:    [[I:%.*]] = sext i32 [[Y]] to i64
 ; IR-NEXT:    [[I1:%.*]] = sext i32 [[X]] to i64
-; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]]
+; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]]
 ; IR-NEXT:    [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr
 ; IR-NEXT:    [[I4:%.*]] = load float, ptr [[I3]], align 4
 ; IR-NEXT:    [[I5:%.*]] = fadd float [[I4]], 0.000000e+00
-; IR-NEXT:    [[I82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1
-; IR-NEXT:    [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I82]] to ptr
+; IR-NEXT:    [[I87:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1
+; IR-NEXT:    [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr
 ; IR-NEXT:    [[I10:%.*]] = load float, ptr [[I9]], align 4
 ; IR-NEXT:    [[I11:%.*]] = fadd float [[I5]], [[I10]]
-; IR-NEXT:    [[I144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32
-; IR-NEXT:    [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I144]] to ptr
+; IR-NEXT:    [[I1412:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32
+; IR-NEXT:    [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr
 ; IR-NEXT:    [[I16:%.*]] = load float, ptr [[I15]], align 4
 ; IR-NEXT:    [[I17:%.*]] = fadd float [[I11]], [[I16]]
-; IR-NEXT:    [[I187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33
-; IR-NEXT:    [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I187]] to ptr
+; IR-NEXT:    [[I1818:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33
+; IR-NEXT:    [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr
 ; IR-NEXT:    [[I20:%.*]] = load float, ptr [[I19]], align 4
 ; IR-NEXT:    [[I21:%.*]] = fadd float [[I17]], [[I20]]
 ; IR-NEXT:    store float [[I21]], ptr [[OUTPUT]], align 4
@@ -84,20 +84,20 @@ define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) {
 ; IR-NEXT:  .preheader:
 ; IR-NEXT:    [[I:%.*]] = sext i32 [[Y]] to i64
 ; IR-NEXT:    [[I1:%.*]] = sext i32 [[X]] to i64
-; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]]
+; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]]
 ; IR-NEXT:    [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr
 ; IR-NEXT:    [[I4:%.*]] = load float, ptr [[I3]], align 4
 ; IR-NEXT:    [[I5:%.*]] = fadd float [[I4]], 0.000000e+00
-; IR-NEXT:    [[I72:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1
-; IR-NEXT:    [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I72]] to ptr
+; IR-NEXT:    [[I77:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1
+; IR-NEXT:    [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr
 ; IR-NEXT:    [[I9:%.*]] = load float, ptr [[I8]], align 4
 ; IR-NEXT:    [[I10:%.*]] = fadd float [[I5]], [[I9]]
-; IR-NEXT:    [[I124:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32
-; IR-NEXT:    [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I124]] to ptr
+; IR-NEXT:    [[I1212:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32
+; IR-NEXT:    [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr
 ; IR-NEXT:    [[I14:%.*]] = load float, ptr [[I13]], align 4
 ; IR-NEXT:    [[I15:%.*]] = fadd float [[I10]], [[I14]]
-; IR-NEXT:    [[I167:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33
-; IR-NEXT:    [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I167]] to ptr
+; IR-NEXT:    [[I1618:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33
+; IR-NEXT:    [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr
 ; IR-NEXT:    [[I18:%.*]] = load float, ptr [[I17]], align 4
 ; IR-NEXT:    [[I19:%.*]] = fadd float [[I15]], [[I18]]
 ; IR-NEXT:    store float [[I19]], ptr [[OUTPUT]], align 4
@@ -145,20 +145,20 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) {
 ; IR-NEXT:  .preheader:
 ; IR-NEXT:    [[I:%.*]] = zext i32 [[Y]] to i64
 ; IR-NEXT:    [[I1:%.*]] = zext i32 [[X]] to i64
-; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]]
+; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]]
 ; IR-NEXT:    [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr
 ; IR-NEXT:    [[I4:%.*]] = load float, ptr [[I3]], align 4
 ; IR-NEXT:    [[I5:%.*]] = fadd float [[I4]], 0.000000e+00
-; IR-NEXT:    [[I82:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1
-; IR-NEXT:    [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I82]] to ptr
+; IR-NEXT:    [[I87:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1
+; IR-NEXT:    [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr
 ; IR-NEXT:    [[I10:%.*]] = load float, ptr [[I9]], align 4
 ; IR-NEXT:    [[I11:%.*]] = fadd float [[I5]], [[I10]]
-; IR-NEXT:    [[I144:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32
-; IR-NEXT:    [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I144]] to ptr
+; IR-NEXT:    [[I1412:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32
+; IR-NEXT:    [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr
 ; IR-NEXT:    [[I16:%.*]] = load float, ptr [[I15]], align 4
 ; IR-NEXT:    [[I17:%.*]] = fadd float [[I11]], [[I16]]
-; IR-NEXT:    [[I187:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33
-; IR-NEXT:    [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I187]] to ptr
+; IR-NEXT:    [[I1818:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33
+; IR-NEXT:    [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr
 ; IR-NEXT:    [[I20:%.*]] = load float, ptr [[I19]], align 4
 ; IR-NEXT:    [[I21:%.*]] = fadd float [[I17]], [[I20]]
 ; IR-NEXT:    store float [[I21]], ptr [[OUTPUT]], align 4
@@ -205,20 +205,20 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) {
 ; IR-NEXT:  .preheader:
 ; IR-NEXT:    [[I:%.*]] = zext i32 [[Y]] to i64
 ; IR-NEXT:    [[I1:%.*]] = zext i32 [[X]] to i64
-; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[I1]], i64 [[I]]
+; IR-NEXT:    [[I2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i32 0, i32 [[X]], i32 [[Y]]
 ; IR-NEXT:    [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr
 ; IR-NEXT:    [[I4:%.*]] = load float, ptr [[I3]], align 4
 ; IR-NEXT:    [[I5:%.*]] = fadd float [[I4]], 0.000000e+00
-; IR-NEXT:    [[I72:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 1
-; IR-NEXT:    [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I72]] to ptr
+; IR-NEXT:    [[I77:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 1
+; IR-NEXT:    [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr
 ; IR-NEXT:    [[I9:%.*]] = load float, ptr [[I8]], align 4
 ; IR-NEXT:    [[I10:%.*]] = fadd float [[I5]], [[I9]]
-; IR-NEXT:    [[I124:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 32
-; IR-NEXT:    [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I124]] to ptr
+; IR-NEXT:    [[I1212:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 32
+; IR-NEXT:    [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr
 ; IR-NEXT:    [[I14:%.*]] = load float, ptr [[I13]], align 4
 ; IR-NEXT:    [[I15:%.*]] = fadd float [[I10]], [[I14]]
-; IR-NEXT:    [[I167:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i64 33
-; IR-NEXT:    [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I167]] to ptr
+; IR-NEXT:    [[I1618:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[I2]], i32 33
+; IR-NEXT:    [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr
 ; IR-NEXT:    [[I18:%.*]] = load float, ptr [[I17]], align 4
 ; IR-NEXT:    [[I19:%.*]] = fadd float [[I15]], [[I18]]
 ; IR-NEXT:    store float [[I19]], ptr [[OUTPUT]], align 4

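The i64-to-i32 GEP indices above are likewise layout-driven: the amdgcn DataLayout describes local memory, addrspace(3), with 32-bit pointers, so once the layout is inferred, address computations into @array are canonicalized to the 32-bit index width. A hypothetical standalone example (assuming instcombine's usual shrinking of over-wide indices) of the same canonicalization:

; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
@lds = internal addrspace(3) global [64 x float] undef
define ptr addrspace(3) @lds_elt(i64 %i) {
  %gep = getelementptr [64 x float], ptr addrspace(3) @lds, i64 0, i64 %i
  ret ptr addrspace(3) %gep
}
; CHECK: [[IDX:%.*]] = trunc i64 %i to i32
; CHECK: getelementptr {{.*}} i32 [[IDX]]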
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll
index 60d005be3d73ef7..1b220ceac35144f 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll
@@ -12,10 +12,9 @@
 define ptr addrspace(3) @packed_struct(i32 %i, i32 %j) {
 ; CHECK-LABEL: @packed_struct(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[I:%.*]] to i64
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[J:%.*]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i64 0, i64 [[TMP0]], i32 1, i64 [[TMP1]]
-; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 100
+; CHECK-NEXT:    [[IDXPROM:%.*]] = trunc i64 0 to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i32 [[IDXPROM]], i32 [[I:%.*]], i32 1, i32 [[J:%.*]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP0]], i32 100
 ; CHECK-NEXT:    ret ptr addrspace(3) [[UGLYGEP]]
 ;
 entry:

diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
index 5ae4ea9dad7de0a..67d78fc90512f59 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll
@@ -163,7 +163,7 @@ define ptr @expr(i64 %a, i64 %b, ptr %out) {
 ; CHECK-NEXT:    [[I2:%.*]] = add i64 [[B]], [[A]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 0
 ; CHECK-NEXT:    [[P3:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 160
-; CHECK-NEXT:    store i64 [[B5]], ptr [[OUT]], align 4
+; CHECK-NEXT:    store i64 [[B5]], ptr [[OUT]], align 8
 ; CHECK-NEXT:    ret ptr [[P3]]
 ;
 entry:
@@ -358,8 +358,8 @@ define ptr @sign_mod_unsign(ptr %ptr, i64 %idx) {
 ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1
-; CHECK-NEXT:    [[PTR22:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 -3
-; CHECK-NEXT:    ret ptr [[PTR22]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -64
+; CHECK-NEXT:    ret ptr [[UGLYGEP]]
 ;
 entry:
   %arrayidx = add nsw i64 %idx, -2
@@ -373,7 +373,7 @@ define ptr @trunk_explicit(ptr %ptr, i64 %idx) {
 ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1
-; CHECK-NEXT:    [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151
+; CHECK-NEXT:    [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 134
 ; CHECK-NEXT:    ret ptr [[PTR21]]
 ;
 entry:
@@ -390,7 +390,7 @@ define ptr @trunk_long_idx(ptr %ptr, i64 %idx) {
 ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1
-; CHECK-NEXT:    [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 151
+; CHECK-NEXT:    [[PTR21:%.*]] = getelementptr inbounds [[STRUCT2:%.*]], ptr [[TMP0]], i64 134
 ; CHECK-NEXT:    ret ptr [[PTR21]]
 ;
 entry:

diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll
index 1f47d2f0c2c79b1..ed0a1185985cc63 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll
@@ -13,11 +13,11 @@ define i64 @test1(ptr %array, i64 %i, i64 %j)  {
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[I:%.*]], 5
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY:%.*]], i64 [[I]]
 ; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 5
-; CHECK-NEXT:    store i64 [[J:%.*]], ptr [[GEP4]], align 4
+; CHECK-NEXT:    store i64 [[J:%.*]], ptr [[GEP4]], align 8
 ; CHECK-NEXT:    [[GEP26:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 6
-; CHECK-NEXT:    store i64 [[J]], ptr [[GEP26]], align 4
+; CHECK-NEXT:    store i64 [[J]], ptr [[GEP26]], align 8
 ; CHECK-NEXT:    [[GEP38:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 35
-; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP38]], align 4
+; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP38]], align 8
 ; CHECK-NEXT:    ret i64 undef
 ;
 entry:
@@ -169,11 +169,11 @@ define i64 @test6(ptr %array, i64 %i, i64 %j) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[I:%.*]], 5
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARRAY:%.*]], i64 [[J:%.*]]
-; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP]], align 4
+; CHECK-NEXT:    store i64 [[ADD]], ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY]], i64 [[I]]
 ; CHECK-NEXT:    [[GEP52:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 6
-; CHECK-NEXT:    store i64 [[I]], ptr [[GEP52]], align 4
-; CHECK-NEXT:    store i64 [[I]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    store i64 [[I]], ptr [[GEP52]], align 8
+; CHECK-NEXT:    store i64 [[I]], ptr [[TMP0]], align 8
 ; CHECK-NEXT:    ret i64 undef
 ;
 entry:

diff --git a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll
index 5781dc262ffd16f..362b4ec73401ca5 100644
--- a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll
+++ b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll
@@ -130,7 +130,7 @@ define i8 @call_pointer(i8 zeroext %x, i8 zeroext %y, ptr %a, ptr %b) {
 define i16 @pointer_to_pointer(ptr %arg, i16 zeroext %limit) {
 ; CHECK-LABEL: @pointer_to_pointer(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADDR:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
+; CHECK-NEXT:    [[ADDR:%.*]] = load ptr, ptr [[ARG:%.*]], align 4
 ; CHECK-NEXT:    [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[VAL]] to i32
 ; CHECK-NEXT:    [[ADD:%.*]] = add nuw i32 [[TMP0]], 7

diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll
index e5be89dd02b1632..05251cb829b2b22 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant-inseltpoison.ll
@@ -26,7 +26,7 @@ define <2 x i64> @add_constant_not_undef_lane(i64 %x) {
 
 define <2 x i64> @add_constant_load(ptr %p) {
 ; CHECK-LABEL: @add_constant_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0
 ; CHECK-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 -42>
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
@@ -152,7 +152,7 @@ define <2 x i64> @shl_constant_op0_not_undef_lane(i64 %x) {
 
 define <2 x i64> @shl_constant_op0_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op0_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 1
 ; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i64> <i64 undef, i64 2>, [[INS]]
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
@@ -203,7 +203,7 @@ define <2 x i64> @shl_constant_op1_not_undef_lane(i64 %x) {
 
 define <2 x i64> @shl_constant_op1_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op1_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0
 ; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i64> [[INS]], <i64 5, i64 2>
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]

diff --git a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
index 2a0db43b49b88b6..21adcdfd29b8260 100644
--- a/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/insert-binop-with-constant.ll
@@ -26,7 +26,7 @@ define <2 x i64> @add_constant_not_undef_lane(i64 %x) {
 
 define <2 x i64> @add_constant_load(ptr %p) {
 ; CHECK-LABEL: @add_constant_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
 ; CHECK-NEXT:    [[BO:%.*]] = add <2 x i64> [[INS]], <i64 42, i64 -42>
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
@@ -152,7 +152,7 @@ define <2 x i64> @shl_constant_op0_not_undef_lane(i64 %x) {
 
 define <2 x i64> @shl_constant_op0_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op0_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 1
 ; CHECK-NEXT:    [[BO:%.*]] = shl <2 x i64> <i64 undef, i64 2>, [[INS]]
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]
@@ -203,7 +203,7 @@ define <2 x i64> @shl_constant_op1_not_undef_lane(i64 %x) {
 
 define <2 x i64> @shl_constant_op1_load(ptr %p) {
 ; CHECK-LABEL: @shl_constant_op1_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
 ; CHECK-NEXT:    [[BO:%.*]] = shl nuw <2 x i64> [[INS]], <i64 5, i64 2>
 ; CHECK-NEXT:    ret <2 x i64> [[BO]]

diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
index 225fa3c9670b373..14b517d613de40a 100644
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
@@ -169,7 +169,7 @@ define <2 x i1> @constant_op1_i64_not_undef_lane(i64 %x) {
 
 define <2 x i1> @constant_op1_i64_load(ptr %p) {
 ; CHECK-LABEL: @constant_op1_i64_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[LD]], i32 0
 ; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]

diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
index 11f85b2d4767207..e9ab99c3e0d9c44 100644
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
@@ -169,7 +169,7 @@ define <2 x i1> @constant_op1_i64_not_undef_lane(i64 %x) {
 
 define <2 x i1> @constant_op1_i64_load(ptr %p) {
 ; CHECK-LABEL: @constant_op1_i64_load(
-; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[LD:%.*]] = load i64, ptr [[P:%.*]], align 8
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <2 x i64> undef, i64 [[LD]], i32 0
 ; CHECK-NEXT:    [[R:%.*]] = icmp eq <2 x i64> [[INS]], <i64 42, i64 -42>
 ; CHECK-NEXT:    ret <2 x i1> [[R]]

diff --git a/llvm/test/tools/opt/infer-data-layout.ll b/llvm/test/tools/opt/infer-data-layout.ll
new file mode 100644
index 000000000000000..d1bcfec489381a1
--- /dev/null
+++ b/llvm/test/tools/opt/infer-data-layout.ll
@@ -0,0 +1,13 @@
+; REQUIRES: x86-registered-target
+;; Check that we infer the correct datalayout from a target triple
+; RUN: opt -mtriple=i386-linux-gnu -S -passes=no-op-module < %s | FileCheck %s --check-prefix=LINUX
+; RUN: opt -mtriple=i386-apple-darwin -S -passes=no-op-module < %s | FileCheck %s --check-prefix=DARWIN
+; RUN: opt -mtriple=i386-windows-msvc -S -passes=no-op-module < %s | FileCheck %s --check-prefix=WINDOWS
+
+target datalayout = ""
+; LINUX: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128"
+; LINUX: target triple = "i386-unknown-linux-gnu"
+; DARWIN: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:128-n8:16:32-S128"
+; DARWIN: target triple = "i386-apple-darwin"
+; WINDOWS: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32"
+; WINDOWS: target triple = "i386-unknown-windows-msvc"

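As a cross-check of the layout strings expected above: they are simply what the corresponding TargetMachine reports for each triple. A minimal standalone sketch of that derivation follows (defaultLayoutFor is an illustrative helper, not part of this commit; it assumes the standard TargetRegistry/TargetMachine APIs and that the relevant targets have been registered):

  // Sketch: derive the default DataLayout string for a triple. Assumes
  // InitializeAllTargetInfos(), InitializeAllTargets() and
  // InitializeAllTargetMCs() have already run.
  #include "llvm/ADT/StringRef.h"
  #include "llvm/MC/TargetRegistry.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetOptions.h"
  #include <memory>
  #include <optional>
  #include <string>

  static std::optional<std::string> defaultLayoutFor(llvm::StringRef TripleStr) {
    std::string Error;
    const llvm::Target *T =
        llvm::TargetRegistry::lookupTarget(TripleStr.str(), Error);
    if (!T)
      return std::nullopt; // Error now holds "unable to get target for ...".
    std::unique_ptr<llvm::TargetMachine> TM(T->createTargetMachine(
        TripleStr.str(), /*CPU=*/"", /*Features=*/"", llvm::TargetOptions(),
        /*RM=*/std::nullopt));
    return TM->createDataLayout().getStringRepresentation();
  }

The patch itself goes through codegen::createTargetMachineForTriple() instead, so the codegen CPU/feature flags are taken into account as well.
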
diff --git a/llvm/test/tools/opt/invalid-target.ll b/llvm/test/tools/opt/invalid-target.ll
new file mode 100644
index 000000000000000..67c9b3c2a0a47fc
--- /dev/null
+++ b/llvm/test/tools/opt/invalid-target.ll
@@ -0,0 +1,16 @@
+;; Check that invalid triples are handled correctly by opt.
+
+;; No diagnostics should be printed for an explicitly/implicitly empty triple
+; RUN: opt -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTY
+; RUN: opt '-mtriple=' -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --allow-empty --check-prefix=EMPTY
+; EMPTY-NOT: {{.+}}
+
+;; Using "unknown" as the architecture is explicitly allowed (but warns)
+; RUN: opt -mtriple=unknown -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=UNKNOWN
+; UNKNOWN: opt: warning: failed to infer data layout: unable to get target for 'unknown', see --version and --triple.
+
+;; However, any other invalid target triple should cause the tool to fail:
+; RUN: not opt -mtriple=invalid -S -passes=no-op-module -o /dev/null < %s 2>&1 | FileCheck %s --check-prefix=INVALID
+; INVALID: opt: warning: failed to infer data layout: unable to get target for 'invalid', see --version and --triple.
+; INVALID-NEXT: opt: unrecognized architecture 'invalid' provided.
+; INVALID-EMPTY:

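The asymmetry between the UNKNOWN and INVALID cases is worth spelling out: both spellings parse to Triple::UnknownArch, but llvm::Triple preserves the original arch text, and only the explicit spelling "unknown" is tolerated. A rough sketch of that distinction (archIsAcceptable is an illustrative helper assuming llvm::Triple's parsing behaviour, not opt's actual code):

  // Sketch: "unknown" is an explicitly-unknown architecture, while
  // "invalid" is an unrecognized spelling. Both map to Triple::UnknownArch,
  // but getArchName() keeps the original text, which is what gets checked.
  #include "llvm/ADT/StringRef.h"
  #include "llvm/TargetParser/Triple.h"

  static bool archIsAcceptable(llvm::StringRef TripleStr) {
    llvm::Triple T(llvm::Triple::normalize(TripleStr));
    if (T.getArch() != llvm::Triple::UnknownArch)
      return true; // A recognized architecture name.
    // Tolerate an explicitly-unknown (or empty) arch component; any other
    // spelling is what produces "unrecognized architecture ... provided.".
    return T.getArchName() == "unknown" || T.getArchName().empty();
  }
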
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 5f415e50d20f2c7..bb6627364442ef7 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -500,10 +500,36 @@ int main(int argc, char **argv) {
   std::unique_ptr<ToolOutputFile> RemarksFile = std::move(*RemarksFileOrErr);
 
   // Load the input module...
-  auto SetDataLayout = [](StringRef, StringRef) -> std::optional<std::string> {
-    if (ClDataLayout.empty())
+  auto SetDataLayout = [&](StringRef IRTriple,
+                           StringRef IRLayout) -> std::optional<std::string> {
+    // Data layout specified on the command line has the highest priority.
+    if (!ClDataLayout.empty())
+      return ClDataLayout;
+    // If an explicit data layout is already defined in the IR, don't infer.
+    if (!IRLayout.empty())
       return std::nullopt;
-    return ClDataLayout;
+
+    // If an explicit triple was specified (either in the IR or on the
+    // command line), use that to infer the default data layout. However, the
+    // command line target triple should override the IR file target triple.
+    std::string TripleStr =
+        TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple);
+    // If the triple string is still empty, we don't fall back to
+    // sys::getDefaultTargetTriple() since we do not want to have differing
+    // behaviour dependent on the configured default triple. Therefore, if the
+    // user did not pass -mtriple or define an explicit triple/datalayout in
+    // the IR, we should default to an empty (default) DataLayout.
+    if (TripleStr.empty())
+      return std::nullopt;
+    // Otherwise we infer the DataLayout from the target machine.
+    Expected<std::unique_ptr<TargetMachine>> ExpectedTM =
+        codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel());
+    if (!ExpectedTM) {
+      errs() << argv[0] << ": warning: failed to infer data layout: "
+             << toString(ExpectedTM.takeError()) << "\n";
+      return std::nullopt;
+    }
+    return (*ExpectedTM)->createDataLayout().getStringRepresentation();
   };
   std::unique_ptr<Module> M;
   if (NoUpgradeDebugInfo)
@@ -531,7 +557,7 @@ int main(int argc, char **argv) {
     }
   }
 
-  // If we are supposed to override the target triple or data layout, do so now.
+  // If we are supposed to override the target triple, do so now.
   if (!TargetTriple.empty())
     M->setTargetTriple(Triple::normalize(TargetTriple));
 

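For context on where SetDataLayout ends up: the lambda is handed to the IR parsers, which invoke it with the triple and datalayout strings found in the input and apply a returned string as the module's layout. A trimmed sketch of that plumbing (loadWithLayoutOverride is an illustrative wrapper, assuming the ParserCallbacks API from llvm/AsmParser/Parser.h):

  // Sketch: returning a string from the callback overrides the module's
  // datalayout; returning std::nullopt keeps whatever the IR specified.
  #include "llvm/ADT/StringRef.h"
  #include "llvm/AsmParser/Parser.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IRReader/IRReader.h"
  #include "llvm/Support/SourceMgr.h"
  #include <memory>
  #include <optional>
  #include <string>

  static std::unique_ptr<llvm::Module>
  loadWithLayoutOverride(llvm::StringRef Path, llvm::LLVMContext &Ctx) {
    auto SetDataLayout =
        [](llvm::StringRef IRTriple,
           llvm::StringRef IRLayout) -> std::optional<std::string> {
      if (!IRLayout.empty())
        return std::nullopt; // Keep an explicit layout from the IR.
      // ...infer a layout from IRTriple here, as the patch does...
      return std::nullopt;
    };
    llvm::SMDiagnostic Err;
    return llvm::parseIRFile(Path, Err, Ctx,
                             llvm::ParserCallbacks(SetDataLayout));
  }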