[llvm] Scalarize binop of splats build vector/free (PR #65773)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 8 08:55:27 PDT 2023
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/65773:
When scalarizing binary ops of splats, we treat the extract as free for a
splat_vector because its operand is already scalar, i.e.
(extract idx, (splat_vector x)) -> x.
The same applies to a build_vector that is a splat:
(extract idx, (build_vector x x x x)) -> x.
This patch takes that into account, which enables scalarization for fixed-length
vectors, since the current canonical form for a splatted fixed-length vector is
still build_vector.
This improves what we were seeing on RISC-V in #65068, but unfortunately it causes
some patterns to be missed on other targets. One big one is that on AArch64 and
X86, scalarizing (xor (splat x), (splat -1)) to (splat (xor x, -1)) prevents vnot
from being matched, which in turn prevents patterns such as bif from being matched.
Posting this patch as a WIP to show my findings.
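
For reference, the condition the patch relaxes can be sketched roughly as follows.
This is a minimal illustration only: the helper name isSplatWithFreeExtract is made
up, and the actual change (in the DAGCombiner.cpp hunk below) only checks the
opcodes, since getSplatSourceVector has already established that both operands are
splats.

// A minimal sketch, not the actual patch: returns true if the splatted
// scalar of V can be recovered without a real extract, i.e. the scalar
// operand is already directly available on the node.
#include "llvm/CodeGen/SelectionDAGNodes.h"

using namespace llvm;

static bool isSplatWithFreeExtract(SDValue V) {
  // (extract idx, (splat_vector x)) -> x: the operand is already scalar.
  if (V.getOpcode() == ISD::SPLAT_VECTOR)
    return true;
  // (extract idx, (build_vector x x x x)) -> x: a splatted build_vector
  // also has the scalar directly available as an operand.
  if (auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode()))
    return BV->getSplatValue().getNode() != nullptr;
  return false;
}

When both operands satisfy a check like this, scalarizeBinOpOfSplats no longer needs
to ask TLI.isExtractVecEltCheap, which is what lets splatted fixed-length
build_vectors through.
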
From da3bf57909bf62478627aa1256d00e35e9d32a73 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 8 Sep 2023 13:46:23 +0100
Subject: [PATCH 1/2] [RISCV] Add tests where bin ops of splats could be
scalarized. NFC
This adds tests for fixed and scalable vectors where we have a binary op on two
splats that could be scalarized. Normally this would be scalarized in the
middle-end by VectorCombine, but as noted in https://reviews.llvm.org/D159190,
this pattern can crop up during CodeGen afterwards.
Note that a combine already exists for this, but it currently only works on
scalable vectors where the element type == XLEN. See #65068 and #65072.
---
llvm/test/CodeGen/RISCV/rvv/binop-splats.ll | 577 +++++++++++++++++
.../RISCV/rvv/fixed-vectors-binop-splats.ll | 592 ++++++++++++++++++
2 files changed, 1169 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/binop-splats.ll
create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll
new file mode 100644
index 000000000000000..6ef2178ff593ec5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/binop-splats.ll
@@ -0,0 +1,577 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64
+
+define <vscale x 1 x i1> @nxv1i1(i1 %x, i1 %y) {
+; CHECK-LABEL: nxv1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 1 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <vscale x 1 x i1> %head.x, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 1 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <vscale x 1 x i1> %head.y, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = add <vscale x 1 x i1> %splat.x, %splat.y
+ ret <vscale x 1 x i1> %v
+}
+
+define <vscale x 2 x i1> @nxv2i1(i1 %x, i1 %y) {
+; CHECK-LABEL: nxv2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 2 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <vscale x 2 x i1> %head.x, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 2 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <vscale x 2 x i1> %head.y, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = add <vscale x 2 x i1> %splat.x, %splat.y
+ ret <vscale x 2 x i1> %v
+}
+
+define <vscale x 4 x i1> @nxv4i1(i1 %x, i1 %y) {
+; CHECK-LABEL: nxv4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 4 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <vscale x 4 x i1> %head.x, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 4 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <vscale x 4 x i1> %head.y, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = add <vscale x 4 x i1> %splat.x, %splat.y
+ ret <vscale x 4 x i1> %v
+}
+
+define <vscale x 8 x i1> @nxv8i1(i1 %x, i1 %y) {
+; CHECK-LABEL: nxv8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 8 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <vscale x 8 x i1> %head.x, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 8 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <vscale x 8 x i1> %head.y, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = add <vscale x 8 x i1> %splat.x, %splat.y
+ ret <vscale x 8 x i1> %v
+}
+
+define <vscale x 16 x i1> @nxv16i1(i1 %x, i1 %y) {
+; CHECK-LABEL: nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v10, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmsne.vi v11, v8, 0
+; CHECK-NEXT: vmxor.mm v0, v10, v11
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 16 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <vscale x 16 x i1> %head.x, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 16 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <vscale x 16 x i1> %head.y, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = add <vscale x 16 x i1> %splat.x, %splat.y
+ ret <vscale x 16 x i1> %v
+}
+
+define <vscale x 32 x i1> @nxv32i1(i1 %x, i1 %y) {
+; CHECK-LABEL: nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v12, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmsne.vi v13, v8, 0
+; CHECK-NEXT: vmxor.mm v0, v12, v13
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 32 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <vscale x 32 x i1> %head.x, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 32 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <vscale x 32 x i1> %head.y, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = add <vscale x 32 x i1> %splat.x, %splat.y
+ ret <vscale x 32 x i1> %v
+}
+
+define <vscale x 64 x i1> @nxv64i1(i1 %x, i1 %y) {
+; CHECK-LABEL: nxv64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v16, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmsne.vi v17, v8, 0
+; CHECK-NEXT: vmxor.mm v0, v16, v17
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 64 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <vscale x 64 x i1> %head.x, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 64 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <vscale x 64 x i1> %head.y, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = add <vscale x 64 x i1> %splat.x, %splat.y
+ ret <vscale x 64 x i1> %v
+}
+
+define <vscale x 1 x i8> @nxv1i8(i8 %x, i8 %y) {
+; CHECK-LABEL: nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 1 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <vscale x 1 x i8> %head.x, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 1 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <vscale x 1 x i8> %head.y, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = add <vscale x 1 x i8> %splat.x, %splat.y
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 2 x i8> @nxv2i8(i8 %x, i8 %y) {
+; CHECK-LABEL: nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 2 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <vscale x 2 x i8> %head.x, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 2 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <vscale x 2 x i8> %head.y, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = add <vscale x 2 x i8> %splat.x, %splat.y
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 4 x i8> @nxv4i8(i8 %x, i8 %y) {
+; CHECK-LABEL: nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 4 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <vscale x 4 x i8> %head.x, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 4 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <vscale x 4 x i8> %head.y, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = add <vscale x 4 x i8> %splat.x, %splat.y
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 8 x i8> @nxv8i8(i8 %x, i8 %y) {
+; CHECK-LABEL: nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 8 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <vscale x 8 x i8> %head.x, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 8 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <vscale x 8 x i8> %head.y, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = add <vscale x 8 x i8> %splat.x, %splat.y
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 16 x i8> @nxv16i8(i8 %x, i8 %y) {
+; CHECK-LABEL: nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 16 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <vscale x 16 x i8> %head.x, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 16 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <vscale x 16 x i8> %head.y, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = add <vscale x 16 x i8> %splat.x, %splat.y
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 32 x i8> @nxv32i8(i8 %x, i8 %y) {
+; CHECK-LABEL: nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 32 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <vscale x 32 x i8> %head.x, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 32 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <vscale x 32 x i8> %head.y, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = add <vscale x 32 x i8> %splat.x, %splat.y
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 64 x i8> @nxv64i8(i8 %x, i8 %y) {
+; CHECK-LABEL: nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 64 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <vscale x 64 x i8> %head.x, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 64 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <vscale x 64 x i8> %head.y, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = add <vscale x 64 x i8> %splat.x, %splat.y
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 1 x i16> @nxv1i16(i16 %x, i16 %y) {
+; CHECK-LABEL: nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 1 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <vscale x 1 x i16> %head.x, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 1 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <vscale x 1 x i16> %head.y, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = add <vscale x 1 x i16> %splat.x, %splat.y
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 2 x i16> @nxv2i16(i16 %x, i16 %y) {
+; CHECK-LABEL: nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 2 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <vscale x 2 x i16> %head.x, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 2 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <vscale x 2 x i16> %head.y, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = add <vscale x 2 x i16> %splat.x, %splat.y
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 4 x i16> @nxv4i16(i16 %x, i16 %y) {
+; CHECK-LABEL: nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 4 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <vscale x 4 x i16> %head.x, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 4 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <vscale x 4 x i16> %head.y, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = add <vscale x 4 x i16> %splat.x, %splat.y
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 8 x i16> @nxv8i16(i16 %x, i16 %y) {
+; CHECK-LABEL: nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 8 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <vscale x 8 x i16> %head.x, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 8 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <vscale x 8 x i16> %head.y, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = add <vscale x 8 x i16> %splat.x, %splat.y
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 16 x i16> @nxv16i16(i16 %x, i16 %y) {
+; CHECK-LABEL: nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 16 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <vscale x 16 x i16> %head.x, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 16 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <vscale x 16 x i16> %head.y, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = add <vscale x 16 x i16> %splat.x, %splat.y
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 32 x i16> @nxv32i16(i16 %x, i16 %y) {
+; CHECK-LABEL: nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <vscale x 32 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <vscale x 32 x i16> %head.x, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 32 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <vscale x 32 x i16> %head.y, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = add <vscale x 32 x i16> %splat.x, %splat.y
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 1 x i32> @nxv1i32(i32 %x, i32 %y) {
+; RV32-LABEL: nxv1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 1 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <vscale x 1 x i32> %head.x, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 1 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <vscale x 1 x i32> %head.y, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = add <vscale x 1 x i32> %splat.x, %splat.y
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 2 x i32> @nxv2i32(i32 %x, i32 %y) {
+; RV32-LABEL: nxv2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 2 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <vscale x 2 x i32> %head.x, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 2 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <vscale x 2 x i32> %head.y, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = add <vscale x 2 x i32> %splat.x, %splat.y
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 4 x i32> @nxv4i32(i32 %x, i32 %y) {
+; RV32-LABEL: nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <vscale x 4 x i32> %head.x, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 4 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <vscale x 4 x i32> %head.y, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = add <vscale x 4 x i32> %splat.x, %splat.y
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 8 x i32> @nxv8i32(i32 %x, i32 %y) {
+; RV32-LABEL: nxv8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e32, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 8 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <vscale x 8 x i32> %head.x, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 8 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <vscale x 8 x i32> %head.y, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = add <vscale x 8 x i32> %splat.x, %splat.y
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 16 x i32> @nxv16i32(i32 %x, i32 %y) {
+; RV32-LABEL: nxv16i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv16i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a2, zero, e32, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 16 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <vscale x 16 x i32> %head.x, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 16 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <vscale x 16 x i32> %head.y, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = add <vscale x 16 x i32> %splat.x, %splat.y
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 1 x i64> @nxv1i64(i64 %x, i64 %y) {
+; RV32-LABEL: nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 1 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <vscale x 1 x i64> %head.x, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 1 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <vscale x 1 x i64> %head.y, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = add <vscale x 1 x i64> %splat.x, %splat.y
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 2 x i64> @nxv2i64(i64 %x, i64 %y) {
+; RV32-LABEL: nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 2 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <vscale x 2 x i64> %head.x, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 2 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <vscale x 2 x i64> %head.y, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = add <vscale x 2 x i64> %splat.x, %splat.y
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 4 x i64> @nxv4i64(i64 %x, i64 %y) {
+; RV32-LABEL: nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 4 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <vscale x 4 x i64> %head.x, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 4 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <vscale x 4 x i64> %head.y, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = add <vscale x 4 x i64> %splat.x, %splat.y
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 8 x i64> @nxv8i64(i64 %x, i64 %y) {
+; RV32-LABEL: nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: ret
+ %head.x = insertelement <vscale x 8 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <vscale x 8 x i64> %head.x, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head.y = insertelement <vscale x 8 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <vscale x 8 x i64> %head.y, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = add <vscale x 8 x i64> %splat.x, %splat.y
+ ret <vscale x 8 x i64> %v
+}
+
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
new file mode 100644
index 000000000000000..309ccd292fb6259
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
@@ -0,0 +1,592 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64
+
+define <1 x i1> @v1i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v1i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.s.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: ret
+ %head.x = insertelement <1 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <1 x i1> %head.x, <1 x i1> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <1 x i1> %head.y, <1 x i1> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i1> %splat.x, %splat.y
+ ret <1 x i1> %v
+}
+
+define <2 x i1> @v2i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v2i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <2 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <2 x i1> %head.x, <2 x i1> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <2 x i1> %head.y, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i1> %splat.x, %splat.y
+ ret <2 x i1> %v
+}
+
+define <4 x i1> @v4i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v4i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <4 x i1> %head.x, <4 x i1> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <4 x i1> %head.y, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i1> %splat.x, %splat.y
+ ret <4 x i1> %v
+}
+
+define <8 x i1> @v8i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v8i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <8 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <8 x i1> %head.x, <8 x i1> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <8 x i1> %head.y, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i1> %splat.x, %splat.y
+ ret <8 x i1> %v
+}
+
+define <16 x i1> @v16i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v8, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v9, a1
+; CHECK-NEXT: vmsne.vi v9, v9, 0
+; CHECK-NEXT: vmxor.mm v0, v8, v9
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v9, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <16 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <16 x i1> %head.x, <16 x i1> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <16 x i1> %head.y, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i1> %splat.x, %splat.y
+ ret <16 x i1> %v
+}
+
+define <32 x i1> @v32i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v10, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmsne.vi v11, v8, 0
+; CHECK-NEXT: vmxor.mm v0, v10, v11
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v10, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <32 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <32 x i1> %head.x, <32 x i1> poison, <32 x i32> zeroinitializer
+ %head.y = insertelement <32 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <32 x i1> %head.y, <32 x i1> poison, <32 x i32> zeroinitializer
+ %v = add <32 x i1> %splat.x, %splat.y
+ ret <32 x i1> %v
+}
+
+define <64 x i1> @v64i1(i1 %x, i1 %y) {
+; CHECK-LABEL: v64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vmsne.vi v12, v8, 0
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vmv.v.x v8, a1
+; CHECK-NEXT: vmsne.vi v13, v8, 0
+; CHECK-NEXT: vmxor.mm v0, v12, v13
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: vrgather.vi v12, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <64 x i1> poison, i1 %x, i32 0
+ %splat.x = shufflevector <64 x i1> %head.x, <64 x i1> poison, <64 x i32> zeroinitializer
+ %head.y = insertelement <64 x i1> poison, i1 %y, i32 0
+ %splat.y = shufflevector <64 x i1> %head.y, <64 x i1> poison, <64 x i32> zeroinitializer
+ %v = add <64 x i1> %splat.x, %splat.y
+ ret <64 x i1> %v
+}
+
+define <1 x i8> @v1i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <1 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <1 x i8> %head.x, <1 x i8> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <1 x i8> %head.y, <1 x i8> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i8> %splat.x, %splat.y
+ ret <1 x i8> %v
+}
+
+define <2 x i8> @v2i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <2 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <2 x i8> %head.x, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <2 x i8> %head.y, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i8> %splat.x, %splat.y
+ ret <2 x i8> %v
+}
+
+define <4 x i8> @v4i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <4 x i8> %head.x, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <4 x i8> %head.y, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i8> %splat.x, %splat.y
+ ret <4 x i8> %v
+}
+
+define <8 x i8> @v8i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <8 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <8 x i8> %head.x, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <8 x i8> %head.y, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i8> %splat.x, %splat.y
+ ret <8 x i8> %v
+}
+
+define <16 x i8> @v16i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <16 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <16 x i8> %head.x, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <16 x i8> %head.y, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i8> %splat.x, %splat.y
+ ret <16 x i8> %v
+}
+
+define <32 x i8> @v32i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v10, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v10, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <32 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <32 x i8> %head.x, <32 x i8> poison, <32 x i32> zeroinitializer
+ %head.y = insertelement <32 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <32 x i8> %head.y, <32 x i8> poison, <32 x i32> zeroinitializer
+ %v = add <32 x i8> %splat.x, %splat.y
+ ret <32 x i8> %v
+}
+
+define <64 x i8> @v64i8(i8 %x, i8 %y) {
+; CHECK-LABEL: v64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v12, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v12, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <64 x i8> poison, i8 %x, i32 0
+ %splat.x = shufflevector <64 x i8> %head.x, <64 x i8> poison, <64 x i32> zeroinitializer
+ %head.y = insertelement <64 x i8> poison, i8 %y, i32 0
+ %splat.y = shufflevector <64 x i8> %head.y, <64 x i8> poison, <64 x i32> zeroinitializer
+ %v = add <64 x i8> %splat.x, %splat.y
+ ret <64 x i8> %v
+}
+
+define <1 x i16> @v1i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <1 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <1 x i16> %head.x, <1 x i16> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <1 x i16> %head.y, <1 x i16> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i16> %splat.x, %splat.y
+ ret <1 x i16> %v
+}
+
+define <2 x i16> @v2i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <2 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <2 x i16> %head.x, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <2 x i16> %head.y, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i16> %splat.x, %splat.y
+ ret <2 x i16> %v
+}
+
+define <4 x i16> @v4i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <4 x i16> %head.x, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <4 x i16> %head.y, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i16> %splat.x, %splat.y
+ ret <4 x i16> %v
+}
+
+define <8 x i16> @v8i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <8 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <8 x i16> %head.x, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <8 x i16> %head.y, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i16> %splat.x, %splat.y
+ ret <8 x i16> %v
+}
+
+define <16 x i16> @v16i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v10, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v10, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <16 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <16 x i16> %head.x, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <16 x i16> %head.y, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i16> %splat.x, %splat.y
+ ret <16 x i16> %v
+}
+
+define <32 x i16> @v32i16(i16 %x, i16 %y) {
+; CHECK-LABEL: v32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v12, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v12, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <32 x i16> poison, i16 %x, i32 0
+ %splat.x = shufflevector <32 x i16> %head.x, <32 x i16> poison, <32 x i32> zeroinitializer
+ %head.y = insertelement <32 x i16> poison, i16 %y, i32 0
+ %splat.y = shufflevector <32 x i16> %head.y, <32 x i16> poison, <32 x i32> zeroinitializer
+ %v = add <32 x i16> %splat.x, %splat.y
+ ret <32 x i16> %v
+}
+
+define <1 x i32> @v1i32(i32 %x, i32 %y) {
+; CHECK-LABEL: v1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.s.x v8, a0
+; CHECK-NEXT: vadd.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %head.x = insertelement <1 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <1 x i32> %head.x, <1 x i32> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <1 x i32> %head.y, <1 x i32> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i32> %splat.x, %splat.y
+ ret <1 x i32> %v
+}
+
+define <2 x i32> @v2i32(i32 %x, i32 %y) {
+; CHECK-LABEL: v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <2 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <2 x i32> %head.x, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <2 x i32> %head.y, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i32> %splat.x, %splat.y
+ ret <2 x i32> %v
+}
+
+define <4 x i32> @v4i32(i32 %x, i32 %y) {
+; CHECK-LABEL: v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v9, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v9, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <4 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <4 x i32> %head.x, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <4 x i32> %head.y, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i32> %splat.x, %splat.y
+ ret <4 x i32> %v
+}
+
+define <8 x i32> @v8i32(i32 %x, i32 %y) {
+; CHECK-LABEL: v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v10, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v10, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <8 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <8 x i32> %head.x, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <8 x i32> %head.y, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i32> %splat.x, %splat.y
+ ret <8 x i32> %v
+}
+
+define <16 x i32> @v16i32(i32 %x, i32 %y) {
+; CHECK-LABEL: v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: vadd.vx v12, v8, a1
+; CHECK-NEXT: vrgather.vi v8, v12, 0
+; CHECK-NEXT: ret
+ %head.x = insertelement <16 x i32> poison, i32 %x, i32 0
+ %splat.x = shufflevector <16 x i32> %head.x, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head.y = insertelement <16 x i32> poison, i32 %y, i32 0
+ %splat.y = shufflevector <16 x i32> %head.y, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = add <16 x i32> %splat.x, %splat.y
+ ret <16 x i32> %v
+}
+
+define <1 x i64> @v1i64(i64 %x, i64 %y) {
+; RV32-LABEL: v1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vadd.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
+ %head.x = insertelement <1 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <1 x i64> %head.x, <1 x i64> poison, <1 x i32> zeroinitializer
+ %head.y = insertelement <1 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <1 x i64> %head.y, <1 x i64> poison, <1 x i32> zeroinitializer
+ %v = add <1 x i64> %splat.x, %splat.y
+ ret <1 x i64> %v
+}
+
+define <2 x i64> @v2i64(i64 %x, i64 %y) {
+; RV32-LABEL: v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vadd.vv v9, v8, v9
+; RV32-NEXT: vrgather.vi v8, v9, 0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v9, v8, a1
+; RV64-NEXT: vrgather.vi v8, v9, 0
+; RV64-NEXT: ret
+ %head.x = insertelement <2 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <2 x i64> %head.x, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head.y = insertelement <2 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <2 x i64> %head.y, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = add <2 x i64> %splat.x, %splat.y
+ ret <2 x i64> %v
+}
+
+define <4 x i64> @v4i64(i64 %x, i64 %y) {
+; RV32-LABEL: v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vadd.vv v10, v8, v10
+; RV32-NEXT: vrgather.vi v8, v10, 0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v10, v8, a1
+; RV64-NEXT: vrgather.vi v8, v10, 0
+; RV64-NEXT: ret
+ %head.x = insertelement <4 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <4 x i64> %head.x, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head.y = insertelement <4 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <4 x i64> %head.y, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = add <4 x i64> %splat.x, %splat.y
+ ret <4 x i64> %v
+}
+
+define <8 x i64> @v8i64(i64 %x, i64 %y) {
+; RV32-LABEL: v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v8, (a0), zero
+; RV32-NEXT: sw a3, 4(sp)
+; RV32-NEXT: sw a2, 0(sp)
+; RV32-NEXT: mv a0, sp
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vadd.vv v12, v8, v12
+; RV32-NEXT: vrgather.vi v8, v12, 0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v12, v8, a1
+; RV64-NEXT: vrgather.vi v8, v12, 0
+; RV64-NEXT: ret
+ %head.x = insertelement <8 x i64> poison, i64 %x, i32 0
+ %splat.x = shufflevector <8 x i64> %head.x, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head.y = insertelement <8 x i64> poison, i64 %y, i32 0
+ %splat.y = shufflevector <8 x i64> %head.y, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = add <8 x i64> %splat.x, %splat.y
+ ret <8 x i64> %v
+}
+
From 59caba76a16b0f1c1697478d46717361fb761094 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Fri, 8 Sep 2023 12:21:44 +0100
Subject: [PATCH 2/2] [DAGCombiner] Treat extracts from build_vectors that are
splats as free
When scalarizing binary ops of splats, we treat the extract as free for a
splat_vector because its operand is already scalar, i.e.
(extract idx, (splat_vector x)) -> x.
The same applies to a build_vector that is a splat:
(extract idx, (build_vector x x x x)) -> x.
This patch takes that into account, which enables scalarization for fixed-length
vectors, since the current canonical form for a splatted fixed-length vector is
still build_vector.
This improves what we were seeing on RISC-V in #65068, but unfortunately it causes
some patterns to be missed on other targets. One big one is that on AArch64 and
X86, scalarizing (xor (splat x), (splat -1)) to (splat (xor x, -1)) prevents vnot
from being matched, which in turn prevents patterns such as bif from being matched.
Posting this patch as a WIP to show my findings.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 +-
llvm/test/CodeGen/AArch64/active_lane_mask.ll | 18 +-
.../CodeGen/AArch64/arm64-neon-select_cc.ll | 6 +-
.../test/CodeGen/AArch64/arm64-vector-ldst.ll | 20 +-
llvm/test/CodeGen/AArch64/fdiv-combine.ll | 20 +-
.../AArch64/srem-seteq-illegal-types.ll | 22 +-
.../AArch64/sve-fixed-length-fp-select.ll | 24 +-
.../AArch64/sve-fixed-length-fp-vselect.ll | 6 +-
.../AArch64/sve-fixed-length-int-select.ll | 24 +-
.../AArch64/sve-fixed-length-int-vselect.ll | 6 +-
llvm/test/CodeGen/AArch64/sve-gep.ll | 4 +-
.../build-vector-packed-partial-undef.ll | 13 +-
.../CodeGen/AMDGPU/extract-subvector-16bit.ll | 74 ++--
llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 10 +-
.../ARM/crash-on-pow2-shufflevector.ll | 5 +-
llvm/test/CodeGen/PowerPC/combine-fneg.ll | 24 +-
.../CodeGen/PowerPC/repeated-fp-divisors.ll | 11 +-
.../PowerPC/srem-seteq-illegal-types.ll | 82 ++--
.../RISCV/rvv/fixed-vectors-binop-splats.ll | 114 ++++--
llvm/test/CodeGen/SystemZ/dag-combine-03.ll | 6 +-
llvm/test/CodeGen/SystemZ/pr60413.ll | 9 +-
.../LowOverheadLoops/lsr-profitable-chain.ll | 7 +-
.../Thumb2/mve-gather-scatter-optimisation.ll | 152 +++----
llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll | 4 +-
llvm/test/CodeGen/Thumb2/mve-selectcc.ll | 12 +-
llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll | 22 +-
llvm/test/CodeGen/X86/avx512-bugfix-23634.ll | 21 +-
llvm/test/CodeGen/X86/combine-bitselect.ll | 376 +++++++++++++-----
.../CodeGen/X86/expand-vp-int-intrinsics.ll | 8 +-
.../test/CodeGen/X86/known-signbits-vector.ll | 28 +-
.../test/CodeGen/X86/masked_gather_scatter.ll | 16 +-
.../X86/merge-consecutive-loads-128.ll | 1 +
llvm/test/CodeGen/X86/pr50609.ll | 12 +-
.../CodeGen/X86/srem-seteq-illegal-types.ll | 90 +++--
llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 21 +-
35 files changed, 775 insertions(+), 509 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 82c82c1c19bf082..86c28b31efe8535 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26196,20 +26196,20 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
EVT EltVT = VT.getVectorElementType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // TODO: Remove/replace the extract cost check? If the elements are available
- // as scalars, then there may be no extract cost. Should we ask if
- // inserting a scalar back into a vector is cheap instead?
int Index0, Index1;
SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
- // Extract element from splat_vector should be free.
- // TODO: use DAG.isSplatValue instead?
- bool IsBothSplatVector = N0.getOpcode() == ISD::SPLAT_VECTOR &&
- N1.getOpcode() == ISD::SPLAT_VECTOR;
+ // Extracting from a shuffle_vector might cost something, but extracting from
+ // a splat_vector or a splatted build_vector should be free since the operands
+ // are scalars anyway.
+ bool IsExtractFree = (N0.getOpcode() == ISD::SPLAT_VECTOR ||
+ N0.getOpcode() == ISD::BUILD_VECTOR) &&
+ (N1.getOpcode() == ISD::SPLAT_VECTOR ||
+ N1.getOpcode() == ISD::BUILD_VECTOR);
if (!Src0 || !Src1 || Index0 != Index1 ||
Src0.getValueType().getVectorElementType() != EltVT ||
Src1.getValueType().getVectorElementType() != EltVT ||
- !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) ||
+ !(IsExtractFree || TLI.isExtractVecEltCheap(VT, Index0)) ||
!TLI.isOperationLegalOrCustom(Opcode, EltVT))
return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index e8437b5cd801f5b..b42ba4816f7be82 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -473,16 +473,16 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) {
; CHECK-LABEL: lane_mask_v2i1_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi d0, #0x0000ff000000ff
-; CHECK-NEXT: dup v1.2s, w0
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: movi d2, #0x0000ff000000ff
+; CHECK-NEXT: dup v0.2s, w8
; CHECK-NEXT: adrp x8, .LCPI27_0
-; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI27_0]
-; CHECK-NEXT: dup v3.2s, w1
-; CHECK-NEXT: and v1.8b, v1.8b, v0.8b
-; CHECK-NEXT: add v1.2s, v1.2s, v2.2s
-; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s
-; CHECK-NEXT: and v0.8b, v3.8b, v0.8b
-; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI27_0]
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: dup v1.2s, w8
+; CHECK-NEXT: umin v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: cmhi v0.2s, v1.2s, v0.2s
; CHECK-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC)
ret <2 x i1> %active.lane.mask
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
index cad3fb58086d696..6e8daef96549dff 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -267,8 +267,12 @@ define <2 x i32> @test_select_cc_v2i32_icmpi1(i1 %cc, <2 x i32> %a, <2 x i32> %b
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mvn w9, w8
; CHECK-NEXT: dup v2.2s, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: dup v3.2s, w9
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%cmp = icmp ne i1 %cc, 0
%e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
index 3542b26b53539cd..eb16b80ec316c60 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -593,8 +593,9 @@ entry:
define <2 x i32> @fct20(ptr nocapture %sp0) {
; CHECK-LABEL: fct20:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr s0, [x0, #4]
-; CHECK-NEXT: mul.2s v0, v0, v0
+; CHECK-NEXT: ldr w8, [x0, #4]
+; CHECK-NEXT: mul w8, w8, w8
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 1
@@ -607,8 +608,9 @@ entry:
define <4 x i32> @fct21(ptr nocapture %sp0) {
; CHECK-LABEL: fct21:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr s0, [x0, #4]
-; CHECK-NEXT: mul.4s v0, v0, v0
+; CHECK-NEXT: ldr w8, [x0, #4]
+; CHECK-NEXT: mul w8, w8, w8
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 1
@@ -703,8 +705,9 @@ entry:
define <2 x i32> @fct28(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct28:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
-; CHECK-NEXT: mul.2s v0, v0, v0
+; CHECK-NEXT: ldr w8, [x0, x1, lsl #2]
+; CHECK-NEXT: mul w8, w8, w8
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 %offset
@@ -717,8 +720,9 @@ entry:
define <4 x i32> @fct29(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct29:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
-; CHECK-NEXT: mul.4s v0, v0, v0
+; CHECK-NEXT: ldr w8, [x0, x1, lsl #2]
+; CHECK-NEXT: mul w8, w8, w8
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 %offset
diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
index 1ed63f3ef250777..7f3d96388aefa2f 100644
--- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
@@ -100,13 +100,11 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 {
define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
; CHECK-LABEL: splat_three_fdiv_4xfloat:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: fmov v4.4s, #1.00000000
-; CHECK-NEXT: dup v0.4s, v0.s[0]
-; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s
-; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s
-; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s
+; CHECK-NEXT: fmov s4, #1.00000000
+; CHECK-NEXT: fdiv s4, s4, s0
+; CHECK-NEXT: fmul v0.4s, v1.4s, v4.s[0]
+; CHECK-NEXT: fmul v1.4s, v2.4s, v4.s[0]
+; CHECK-NEXT: fmul v2.4s, v3.4s, v4.s[0]
; CHECK-NEXT: b foo_3_4xf
%D.ins = insertelement <4 x float> poison, float %D, i64 0
%splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
@@ -120,11 +118,9 @@ define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b,
define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 {
; CHECK-LABEL: splat_fdiv_v4f32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: fmov v2.4s, #1.00000000
-; CHECK-NEXT: dup v0.4s, v0.s[0]
-; CHECK-NEXT: fdiv v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: fmov s2, #1.00000000
+; CHECK-NEXT: fdiv s0, s2, s0
+; CHECK-NEXT: fmul v0.4s, v1.4s, v0.s[0]
; CHECK-NEXT: ret
entry:
%D.ins = insertelement <4 x float> poison, float %D, i64 0
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index 595991e86a91c70..28bdca6cdad1b2b 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -76,28 +76,28 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; CHECK-NEXT: add x11, x11, x11, lsl #3
; CHECK-NEXT: add x9, x9, x9, lsl #3
; CHECK-NEXT: sub x8, x8, x11
-; CHECK-NEXT: sub x11, x13, x12
+; CHECK-NEXT: sub x12, x13, x12
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov x8, #8589934591 // =0x1ffffffff
; CHECK-NEXT: sub x9, x10, x9
-; CHECK-NEXT: asr x10, x11, #3
+; CHECK-NEXT: lsr x10, x12, #3
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: mov v0.d[1], x9
-; CHECK-NEXT: add x9, x10, x11, lsr #63
+; CHECK-NEXT: add x9, x10, x12, lsr #63
; CHECK-NEXT: add x8, x9, x9, lsl #3
; CHECK-NEXT: adrp x9, .LCPI3_0
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_0]
-; CHECK-NEXT: add x8, x12, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: fmov d3, x8
+; CHECK-NEXT: add x8, x2, x8
+; CHECK-NEXT: ldr q1, [x9, :lo12:.LCPI3_0]
+; CHECK-NEXT: and x8, x8, #0x1ffffffff
+; CHECK-NEXT: dup v2.2d, x8
; CHECK-NEXT: adrp x8, .LCPI3_1
-; CHECK-NEXT: cmeq v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-NEXT: cmeq v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT: cmeq v1.2d, v2.2d, v1.2d
; CHECK-NEXT: mvn v0.16b, v0.16b
-; CHECK-NEXT: cmeq v1.2d, v1.2d, v2.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: mvn v1.16b, v1.16b
+; CHECK-NEXT: xtn v0.2s, v0.2d
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: mov w1, v0.s[1]
; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
index 13ebda1df7f9d1f..0890c6c2581d602 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll
@@ -135,8 +135,12 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) v
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mvn w9, w8
; CHECK-NEXT: dup v2.2s, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: dup v3.2s, w9
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2
ret <2 x float> %sel
@@ -148,8 +152,12 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) v
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mvn w9, w8
; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: dup v3.4s, w9
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2
ret <4 x float> %sel
@@ -259,8 +267,12 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: mvn x9, x8
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2
ret <1 x double> %sel
@@ -272,8 +284,12 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: mvn x9, x8
; CHECK-NEXT: dup v2.2d, x8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: dup v3.2d, x9
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2
ret <2 x double> %sel
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
index 3566bbc2b456141..45f100a3b68e194 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll
@@ -237,8 +237,12 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1>
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: mvn x9, x8
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
ret <1 x double> %sel
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
index 710dce4de6dda3a..6809b8f69bda05f 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll
@@ -254,8 +254,12 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) vscale_
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mvn w9, w8
; CHECK-NEXT: dup v2.2s, w8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: dup v3.2s, w9
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
ret <2 x i32> %sel
@@ -267,8 +271,12 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mvn w9, w8
; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: dup v3.4s, w9
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
ret <4 x i32> %sel
@@ -378,8 +386,12 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) vscale_
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: mvn x9, x8
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
ret <1 x i64> %sel
@@ -391,8 +403,12 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: mvn x9, x8
; CHECK-NEXT: dup v2.2d, x8
-; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: dup v3.2d, x9
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: and v1.16b, v1.16b, v3.16b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
ret <2 x i64> %sel
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
index 2c6ffeaeefd6d95..f13abd56ea07e72 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll
@@ -349,8 +349,12 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) v
; CHECK: // %bb.0:
; CHECK-NEXT: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
+; CHECK-NEXT: mvn x9, x8
; CHECK-NEXT: fmov d2, x8
-; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT: fmov d3, x9
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2
ret <1 x i64> %sel
diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll
index fd93e43613c52c2..d27fd7727f0b4fc 100644
--- a/llvm/test/CodeGen/AArch64/sve-gep.ll
+++ b/llvm/test/CodeGen/AArch64/sve-gep.ll
@@ -33,10 +33,8 @@ define <vscale x 2 x i32>* @scalar_of_scalable_3(<vscale x 2 x i32>* %base, i64
define <2 x <vscale x 2 x i64>*> @fixed_of_scalable_1(<vscale x 2 x i64>* %base) {
; CHECK-LABEL: fixed_of_scalable_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: dup v1.2d, x0
+; CHECK-NEXT: addvl x8, x0, #1
; CHECK-NEXT: dup v0.2d, x8
-; CHECK-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-NEXT: ret
%d = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, <2 x i64> <i64 1, i64 1>
ret <2 x <vscale x 2 x i64>*> %d
diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
index 46b2f82d9de2a3f..432ba48da14f9b4 100644
--- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll
@@ -244,7 +244,8 @@ define void @undef_hi_op_v2f16(half %arg0) {
; GFX9-LABEL: undef_hi_op_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_f16_e32 v0, 1.0, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v0
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use v0
; GFX9-NEXT: ;;#ASMEND
@@ -254,7 +255,8 @@ define void @undef_hi_op_v2f16(half %arg0) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX8-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
@@ -269,8 +271,9 @@ define void @undef_hi_op_v2i16(i16 %arg0) {
; GFX9-LABEL: undef_hi_op_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s4, 0x63
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0]
+; GFX9-NEXT: v_add_u16_e32 v0, 0x63, v0
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use v0
; GFX9-NEXT: ;;#ASMEND
@@ -280,6 +283,8 @@ define void @undef_hi_op_v2i16(i16 %arg0) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u16_e32 v0, 0x63, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 7c469c9f4ccae45..1bf5c1348cdbef8 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -104,14 +104,15 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB0_4: ; %exit
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
-; GFX9-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 15, v3
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: s_movk_i32 s5, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
-; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v3, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -132,19 +133,21 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB0_4: ; %exit
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
+; GFX11-NEXT: v_ashrrev_i16 v0, 15, v3
+; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v4, 0xffff8000, v0
+; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v1
+; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v3
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v2, v4, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
br i1 undef, label %T, label %F
@@ -634,14 +637,15 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB3_4: ; %exit
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
-; GFX9-NEXT: s_movk_i32 s4, 0x8000
+; GFX9-NEXT: v_ashrrev_i16_e32 v0, 15, v5
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT: s_movk_i32 s5, 0x8000
; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
-; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -666,19 +670,21 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB3_4: ; %exit
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2
-; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-NEXT: v_ashrrev_i16 v0, 15, v3
+; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v2 op_sel_hi:[0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_or_b32_e32 v4, 0xffff8000, v0
+; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v1
+; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v3
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v2, v4, 0x5040100
; GFX11-NEXT: s_setpc_b64 s[30:31]
br i1 undef, label %T, label %F
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index d200b25c17d33bd..70c16d550a208e8 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -95,9 +95,10 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sext_i32_i16 s4, s4
; SI-NEXT: s_sext_i32_i16 s5, s5
-; SI-NEXT: s_mul_i32 s5, s5, s4
-; SI-NEXT: s_lshr_b32 s4, s5, 16
+; SI-NEXT: s_mul_i32 s4, s5, s4
+; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_alignbit_b32 v0, s5, v0, 16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -113,9 +114,10 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s0, s2
; VI-NEXT: s_sext_i32_i16 s1, s3
-; VI-NEXT: s_mul_i32 s1, s1, s0
-; VI-NEXT: s_lshr_b32 s0, s1, 16
+; VI-NEXT: s_mul_i32 s0, s1, s0
+; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
index 8186f6c9b42fba8..44d5862fab44f50 100644
--- a/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
+++ b/llvm/test/CodeGen/ARM/crash-on-pow2-shufflevector.ll
@@ -8,9 +8,8 @@
define i32 @foo(ptr %descs, i32 %num, i32 %cw) local_unnamed_addr #0 {
; CHECK-LABEL: foo:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr d16, [r0, #32]
-; CHECK-NEXT: vadd.i32 d16, d16, d16
-; CHECK-NEXT: vmov.32 r0, d16[0]
+; CHECK-NEXT: ldr r0, [r0, #32]
+; CHECK-NEXT: add r0, r0, r0
; CHECK-NEXT: bx lr
entry:
%wide.vec = load <16 x i32>, ptr %descs, align 4
diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
index 3015e68e471a7c5..be7ad8f18178cc7 100644
--- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll
+++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll
@@ -5,19 +5,17 @@
define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) {
; CHECK-LABEL: fneg_fdiv_splat:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1
-; CHECK-NEXT: xxspltd 0, 1, 0
-; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
-; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
-; CHECK-NEXT: lxvd2x 2, 0, 3
-; CHECK-NEXT: xvredp 1, 0
-; CHECK-NEXT: xxlor 3, 2, 2
-; CHECK-NEXT: xvmaddadp 3, 0, 1
-; CHECK-NEXT: xvnmsubadp 1, 1, 3
-; CHECK-NEXT: xvmaddadp 2, 0, 1
-; CHECK-NEXT: xvmsubadp 1, 1, 2
-; CHECK-NEXT: xvmuldp 34, 34, 1
-; CHECK-NEXT: xvmuldp 35, 35, 1
+; CHECK-NEXT: vspltisw 4, 1
+; CHECK-NEXT: xsredp 0, 1
+; CHECK-NEXT: xvcvsxwdp 2, 36
+; CHECK-NEXT: fmr 3, 2
+; CHECK-NEXT: xsnmsubadp 3, 1, 0
+; CHECK-NEXT: xsmaddadp 0, 0, 3
+; CHECK-NEXT: xsnmsubadp 2, 1, 0
+; CHECK-NEXT: xsnmaddadp 0, 0, 2
+; CHECK-NEXT: xxspltd 0, 0, 0
+; CHECK-NEXT: xvmuldp 34, 34, 0
+; CHECK-NEXT: xvmuldp 35, 35, 0
; CHECK-NEXT: blr
entry:
%splat.splatinsert = insertelement <4 x double> undef, double %a0, i32 0
diff --git a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
index 68db90ad2e19873..b94091f106250e8 100644
--- a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
+++ b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll
@@ -5,17 +5,16 @@
define <4 x float> @repeated_fp_divisor_noest(float %a, <4 x float> %b) {
; CHECK-LABEL: repeated_fp_divisor_noest:
; CHECK: # %bb.0:
-; CHECK-NEXT: xscvdpspn 0, 1
-; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
-; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
-; CHECK-NEXT: lxvd2x 1, 0, 3
+; CHECK-NEXT: vspltisw 3, 1
; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
-; CHECK-NEXT: xxspltw 0, 0, 0
-; CHECK-NEXT: xvdivsp 0, 1, 0
+; CHECK-NEXT: xvcvsxwdp 0, 35
+; CHECK-NEXT: xsdivsp 0, 0, 1
; CHECK-NEXT: lxvd2x 1, 0, 3
; CHECK-NEXT: xxswapd 35, 1
; CHECK-NEXT: xvmulsp 1, 34, 35
+; CHECK-NEXT: xscvdpspn 0, 0
+; CHECK-NEXT: xxspltw 0, 0, 0
; CHECK-NEXT: xvmulsp 34, 1, 0
; CHECK-NEXT: blr
%ins = insertelement <4 x float> undef, float %a, i32 0
diff --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
index e7a0f149ac976d9..d78b29415b61a68 100644
--- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
@@ -188,61 +188,61 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; PPC64LE-NEXT: lis 6, 1820
; PPC64LE-NEXT: sldi 3, 3, 31
; PPC64LE-NEXT: ori 6, 6, 29127
-; PPC64LE-NEXT: sldi 4, 4, 31
+; PPC64LE-NEXT: sldi 7, 5, 31
; PPC64LE-NEXT: rldic 6, 6, 34, 3
-; PPC64LE-NEXT: sradi 3, 3, 31
+; PPC64LE-NEXT: sldi 4, 4, 31
; PPC64LE-NEXT: oris 6, 6, 29127
-; PPC64LE-NEXT: sradi 4, 4, 31
-; PPC64LE-NEXT: ori 7, 6, 7282
-; PPC64LE-NEXT: sldi 5, 5, 31
+; PPC64LE-NEXT: sradi 3, 3, 31
+; PPC64LE-NEXT: ori 8, 6, 7282
+; PPC64LE-NEXT: sradi 7, 7, 31
; PPC64LE-NEXT: ori 6, 6, 7281
-; PPC64LE-NEXT: mulhd 8, 3, 7
-; PPC64LE-NEXT: mulhd 7, 4, 7
-; PPC64LE-NEXT: sradi 5, 5, 31
-; PPC64LE-NEXT: mulhd 6, 5, 6
-; PPC64LE-NEXT: rldicl 9, 8, 1, 63
-; PPC64LE-NEXT: rldicl 10, 7, 1, 63
-; PPC64LE-NEXT: add 8, 8, 9
-; PPC64LE-NEXT: add 7, 7, 10
-; PPC64LE-NEXT: sldi 10, 8, 3
-; PPC64LE-NEXT: sub 6, 6, 5
-; PPC64LE-NEXT: add 8, 8, 10
-; PPC64LE-NEXT: sldi 10, 7, 3
-; PPC64LE-NEXT: rldicl 9, 6, 1, 63
-; PPC64LE-NEXT: add 7, 7, 10
-; PPC64LE-NEXT: sub 3, 3, 8
-; PPC64LE-NEXT: addis 8, 2, .LCPI3_1@toc@ha
+; PPC64LE-NEXT: sradi 4, 4, 31
+; PPC64LE-NEXT: mulhd 9, 3, 8
+; PPC64LE-NEXT: mulhd 8, 4, 8
+; PPC64LE-NEXT: mulhd 6, 7, 6
+; PPC64LE-NEXT: rldicl 10, 9, 1, 63
+; PPC64LE-NEXT: sub 6, 6, 7
+; PPC64LE-NEXT: rldicl 7, 8, 1, 63
+; PPC64LE-NEXT: add 9, 9, 10
+; PPC64LE-NEXT: add 7, 8, 7
+; PPC64LE-NEXT: sldi 8, 9, 3
+; PPC64LE-NEXT: rldicl 10, 6, 1, 63
; PPC64LE-NEXT: sradi 6, 6, 3
-; PPC64LE-NEXT: sub 4, 4, 7
+; PPC64LE-NEXT: add 8, 9, 8
+; PPC64LE-NEXT: sldi 9, 7, 3
+; PPC64LE-NEXT: sub 3, 3, 8
+; PPC64LE-NEXT: add 6, 6, 10
+; PPC64LE-NEXT: add 7, 7, 9
; PPC64LE-NEXT: mtfprd 0, 3
; PPC64LE-NEXT: addis 3, 2, .LCPI3_0@toc@ha
-; PPC64LE-NEXT: addi 7, 8, .LCPI3_1@toc@l
-; PPC64LE-NEXT: add 6, 6, 9
-; PPC64LE-NEXT: mtfprd 1, 4
-; PPC64LE-NEXT: addi 3, 3, .LCPI3_0@toc@l
-; PPC64LE-NEXT: lxvd2x 2, 0, 7
; PPC64LE-NEXT: sldi 8, 6, 3
-; PPC64LE-NEXT: lxvd2x 3, 0, 3
-; PPC64LE-NEXT: add 4, 6, 8
-; PPC64LE-NEXT: addis 6, 2, .LCPI3_2@toc@ha
+; PPC64LE-NEXT: sub 4, 4, 7
+; PPC64LE-NEXT: addis 7, 2, .LCPI3_1@toc@ha
+; PPC64LE-NEXT: addi 3, 3, .LCPI3_0@toc@l
+; PPC64LE-NEXT: add 6, 6, 8
+; PPC64LE-NEXT: mtfprd 1, 4
+; PPC64LE-NEXT: addis 4, 2, .LCPI3_2@toc@ha
+; PPC64LE-NEXT: addi 7, 7, .LCPI3_1@toc@l
+; PPC64LE-NEXT: lxvd2x 2, 0, 3
+; PPC64LE-NEXT: add 3, 5, 6
+; PPC64LE-NEXT: addi 4, 4, .LCPI3_2@toc@l
+; PPC64LE-NEXT: lxvd2x 3, 0, 7
+; PPC64LE-NEXT: clrldi 3, 3, 31
; PPC64LE-NEXT: xxmrghd 34, 1, 0
-; PPC64LE-NEXT: add 3, 5, 4
-; PPC64LE-NEXT: addi 4, 6, .LCPI3_2@toc@l
-; PPC64LE-NEXT: xxswapd 35, 2
-; PPC64LE-NEXT: mtfprd 0, 3
-; PPC64LE-NEXT: lxvd2x 1, 0, 4
-; PPC64LE-NEXT: xxland 34, 34, 3
+; PPC64LE-NEXT: lxvd2x 0, 0, 4
+; PPC64LE-NEXT: mtfprd 1, 3
+; PPC64LE-NEXT: xxswapd 35, 3
+; PPC64LE-NEXT: xxspltd 37, 1, 0
; PPC64LE-NEXT: xxswapd 36, 0
+; PPC64LE-NEXT: xxland 34, 34, 2
; PPC64LE-NEXT: vcmpequd 2, 2, 3
-; PPC64LE-NEXT: xxswapd 35, 1
-; PPC64LE-NEXT: xxland 36, 36, 3
-; PPC64LE-NEXT: vcmpequd 3, 4, 3
+; PPC64LE-NEXT: vcmpequd 3, 5, 4
; PPC64LE-NEXT: xxlnor 0, 34, 34
+; PPC64LE-NEXT: xxlnor 34, 35, 35
; PPC64LE-NEXT: xxswapd 1, 0
; PPC64LE-NEXT: mffprwz 4, 0
-; PPC64LE-NEXT: xxlnor 34, 35, 35
-; PPC64LE-NEXT: mffprwz 3, 1
; PPC64LE-NEXT: xxswapd 2, 34
+; PPC64LE-NEXT: mffprwz 3, 1
; PPC64LE-NEXT: mffprwz 5, 2
; PPC64LE-NEXT: blr
%srem = srem <3 x i33> %X, <i33 9, i33 9, i33 -9>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
index 309ccd292fb6259..19b46df04ff1d0a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
@@ -378,12 +378,19 @@ define <32 x i16> @v32i16(i16 %x, i16 %y) {
}
define <1 x i32> @v1i32(i32 %x, i32 %y) {
-; CHECK-LABEL: v1i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: vadd.vx v8, v8, a1
-; CHECK-NEXT: ret
+; RV32-LABEL: v1i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v1i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: vadd.vx v8, v8, a1
+; RV64-NEXT: ret
%head.x = insertelement <1 x i32> poison, i32 %x, i32 0
%splat.x = shufflevector <1 x i32> %head.x, <1 x i32> poison, <1 x i32> zeroinitializer
%head.y = insertelement <1 x i32> poison, i32 %y, i32 0
@@ -393,13 +400,20 @@ define <1 x i32> @v1i32(i32 %x, i32 %y) {
}
define <2 x i32> @v2i32(i32 %x, i32 %y) {
-; CHECK-LABEL: v2i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: vadd.vx v9, v8, a1
-; CHECK-NEXT: vrgather.vi v8, v9, 0
-; CHECK-NEXT: ret
+; RV32-LABEL: v2i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v2i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v9, v8, a1
+; RV64-NEXT: vrgather.vi v8, v9, 0
+; RV64-NEXT: ret
%head.x = insertelement <2 x i32> poison, i32 %x, i32 0
%splat.x = shufflevector <2 x i32> %head.x, <2 x i32> poison, <2 x i32> zeroinitializer
%head.y = insertelement <2 x i32> poison, i32 %y, i32 0
@@ -409,13 +423,20 @@ define <2 x i32> @v2i32(i32 %x, i32 %y) {
}
define <4 x i32> @v4i32(i32 %x, i32 %y) {
-; CHECK-LABEL: v4i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: vadd.vx v9, v8, a1
-; CHECK-NEXT: vrgather.vi v8, v9, 0
-; CHECK-NEXT: ret
+; RV32-LABEL: v4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v9, v8, a1
+; RV64-NEXT: vrgather.vi v8, v9, 0
+; RV64-NEXT: ret
%head.x = insertelement <4 x i32> poison, i32 %x, i32 0
%splat.x = shufflevector <4 x i32> %head.x, <4 x i32> poison, <4 x i32> zeroinitializer
%head.y = insertelement <4 x i32> poison, i32 %y, i32 0
@@ -425,13 +446,20 @@ define <4 x i32> @v4i32(i32 %x, i32 %y) {
}
define <8 x i32> @v8i32(i32 %x, i32 %y) {
-; CHECK-LABEL: v8i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: vadd.vx v10, v8, a1
-; CHECK-NEXT: vrgather.vi v8, v10, 0
-; CHECK-NEXT: ret
+; RV32-LABEL: v8i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v8i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v10, v8, a1
+; RV64-NEXT: vrgather.vi v8, v10, 0
+; RV64-NEXT: ret
%head.x = insertelement <8 x i32> poison, i32 %x, i32 0
%splat.x = shufflevector <8 x i32> %head.x, <8 x i32> poison, <8 x i32> zeroinitializer
%head.y = insertelement <8 x i32> poison, i32 %y, i32 0
@@ -441,13 +469,20 @@ define <8 x i32> @v8i32(i32 %x, i32 %y) {
}
define <16 x i32> @v16i32(i32 %x, i32 %y) {
-; CHECK-LABEL: v16i32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a0
-; CHECK-NEXT: vadd.vx v12, v8, a1
-; CHECK-NEXT: vrgather.vi v8, v12, 0
-; CHECK-NEXT: ret
+; RV32-LABEL: v16i32:
+; RV32: # %bb.0:
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT: vmv.v.x v8, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: v16i32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV64-NEXT: vmv.v.x v8, a0
+; RV64-NEXT: vadd.vx v12, v8, a1
+; RV64-NEXT: vrgather.vi v8, v12, 0
+; RV64-NEXT: ret
%head.x = insertelement <16 x i32> poison, i32 %x, i32 0
%splat.x = shufflevector <16 x i32> %head.x, <16 x i32> poison, <16 x i32> zeroinitializer
%head.y = insertelement <16 x i32> poison, i32 %y, i32 0
@@ -476,9 +511,9 @@ define <1 x i64> @v1i64(i64 %x, i64 %y) {
;
; RV64-LABEL: v1i64:
; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT: vmv.s.x v8, a0
-; RV64-NEXT: vadd.vx v8, v8, a1
; RV64-NEXT: ret
%head.x = insertelement <1 x i64> poison, i64 %x, i32 0
%splat.x = shufflevector <1 x i64> %head.x, <1 x i64> poison, <1 x i32> zeroinitializer
@@ -509,10 +544,9 @@ define <2 x i64> @v2i64(i64 %x, i64 %y) {
;
; RV64-LABEL: v2i64:
; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vadd.vx v9, v8, a1
-; RV64-NEXT: vrgather.vi v8, v9, 0
; RV64-NEXT: ret
%head.x = insertelement <2 x i64> poison, i64 %x, i32 0
%splat.x = shufflevector <2 x i64> %head.x, <2 x i64> poison, <2 x i32> zeroinitializer
@@ -543,10 +577,9 @@ define <4 x i64> @v4i64(i64 %x, i64 %y) {
;
; RV64-LABEL: v4i64:
; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vadd.vx v10, v8, a1
-; RV64-NEXT: vrgather.vi v8, v10, 0
; RV64-NEXT: ret
%head.x = insertelement <4 x i64> poison, i64 %x, i32 0
%splat.x = shufflevector <4 x i64> %head.x, <4 x i64> poison, <4 x i32> zeroinitializer
@@ -577,10 +610,9 @@ define <8 x i64> @v8i64(i64 %x, i64 %y) {
;
; RV64-LABEL: v8i64:
; RV64: # %bb.0:
+; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vadd.vx v12, v8, a1
-; RV64-NEXT: vrgather.vi v8, v12, 0
; RV64-NEXT: ret
%head.x = insertelement <8 x i64> poison, i64 %x, i32 0
%splat.x = shufflevector <8 x i64> %head.x, <8 x i64> poison, <8 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/SystemZ/dag-combine-03.ll b/llvm/test/CodeGen/SystemZ/dag-combine-03.ll
index 3625ac68b327f78..15750f54ed054f8 100644
--- a/llvm/test/CodeGen/SystemZ/dag-combine-03.ll
+++ b/llvm/test/CodeGen/SystemZ/dag-combine-03.ll
@@ -24,11 +24,7 @@ define void @fun(i64 %a0) {
; CHECK-NEXT: cgr %r0, %r2
; CHECK-NEXT: lhi %r0, 0
; CHECK-NEXT: lochie %r0, 1
-; CHECK-NEXT: vlvgp %v0, %r3, %r3
-; CHECK-NEXT: vlvgp %v1, %r0, %r0
-; CHECK-NEXT: vx %v0, %v0, %v1
-; CHECK-NEXT: vlgvf %r0, %v0, 1
-; CHECK-NEXT: chi %r0, 0
+; CHECK-NEXT: xr %r0, %r3
; CHECK-NEXT: locghie %r1, 0
; CHECK-NEXT: j .LBB0_1
entry:
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index 532dbde89c826c6..f76569630dd10e1 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -27,12 +27,11 @@ define dso_local void @m() local_unnamed_addr #1 {
; CHECK-NEXT: vlvgp %v0, %r0, %r1
; CHECK-NEXT: vlvgf %v0, %r0, 0
; CHECK-NEXT: vlvgf %v0, %r0, 2
-; CHECK-NEXT: vlvgp %v2, %r1, %r1
+; CHECK-NEXT: vgbm %v2, 30583
+; CHECK-NEXT: vn %v0, %v0, %v2
+; CHECK-NEXT: vn %v1, %v1, %v2
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
; CHECK-NEXT: vrepf %v2, %v2, 1
-; CHECK-NEXT: vgbm %v3, 30583
-; CHECK-NEXT: vn %v0, %v0, %v3
-; CHECK-NEXT: vn %v1, %v1, %v3
-; CHECK-NEXT: vn %v2, %v2, %v3
; CHECK-NEXT: vrepif %v3, 127
; CHECK-NEXT: vchlf %v1, %v1, %v3
; CHECK-NEXT: vlgvf %r13, %v1, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
index 6d9dc46e1caa046..a60c804bea1dfb0 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll
@@ -11,13 +11,10 @@ define arm_aapcs_vfpcc float @vctpi32(ptr %0, i32 %1) {
; CHECK-LABEL: vctpi32:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: mvn r3, #31
-; CHECK-NEXT: vmov.32 q2[0], r0
-; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: subs r2, r1, #1
-; CHECK-NEXT: vadd.i32 q2, q2, r3
+; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: subs r0, #32
; CHECK-NEXT: vidup.u32 q1, r4, #8
-; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: dlstp.32 lr, r2
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index ad63e9ee9ff4c92..310f96474020843 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -460,16 +460,15 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: add.w r7, r6, r7, lsr #1
-; CHECK-NEXT: vdup.32 q1, r9
+; CHECK-NEXT: vdup.32 q0, r9
; CHECK-NEXT: bic r7, r7, #3
-; CHECK-NEXT: vshl.i32 q3, q1, #3
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: add.w r10, r6, r7, lsr #2
; CHECK-NEXT: adr r7, .LCPI9_0
; CHECK-NEXT: adr r6, .LCPI9_1
; CHECK-NEXT: vldrw.u32 q2, [r7]
-; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r6]
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
@@ -481,33 +480,32 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
-; CHECK-NEXT: vdup.32 q5, r7
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vshl.i32 q5, q5, #2
-; CHECK-NEXT: vmov q6, q1
-; CHECK-NEXT: vadd.i32 q5, q5, r0
+; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload
+; CHECK-NEXT: add.w r4, r0, r7, lsl #2
+; CHECK-NEXT: lsl.w r6, r9, #3
; CHECK-NEXT: dls lr, r10
-; CHECK-NEXT: vmov.i32 q4, #0x0
-; CHECK-NEXT: vadd.i32 q5, q5, q0
-; CHECK-NEXT: vmlas.i32 q6, q2, r5
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmlas.i32 q6, q1, r5
+; CHECK-NEXT: vadd.i32 q4, q2, r4
+; CHECK-NEXT: vdup.32 q5, r6
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q7, q6, q3
+; CHECK-NEXT: vadd.i32 q7, q6, q5
; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
-; CHECK-NEXT: vldrw.u32 q6, [q5, #32]!
+; CHECK-NEXT: vldrw.u32 q6, [q4, #32]!
; CHECK-NEXT: vmul.i32 q0, q0, q6
; CHECK-NEXT: vmov q6, q7
-; CHECK-NEXT: vadd.i32 q4, q0, q4
+; CHECK-NEXT: vadd.i32 q3, q0, q3
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
-; CHECK-NEXT: add.w r4, r5, r11
+; CHECK-NEXT: add.w r6, r5, r11
; CHECK-NEXT: adds r5, #1
-; CHECK-NEXT: vaddv.u32 r6, q4
+; CHECK-NEXT: vaddv.u32 r4, q3
; CHECK-NEXT: cmp r5, r9
-; CHECK-NEXT: str.w r6, [r2, r4, lsl #2]
+; CHECK-NEXT: str.w r4, [r2, r6, lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
@@ -522,15 +520,15 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32*
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI9_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 6 @ 0x6
-; CHECK-NEXT: .LCPI9_1:
; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT: .LCPI9_1:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 4 @ 0x4
+; CHECK-NEXT: .long 6 @ 0x6
for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry
%0 = add i32 %l, -1
@@ -607,11 +605,11 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrne r0, [sp, #136]
-; CHECK-NEXT: cmpne r0, #0
+; CHECK-NEXT: ldrne.w r12, [sp, #136]
+; CHECK-NEXT: cmpne.w r12, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
@@ -619,24 +617,23 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT: ldr.w r12, [sp, #140]
+; CHECK-NEXT: ldr.w r11, [sp, #140]
; CHECK-NEXT: movs r7, #1
-; CHECK-NEXT: mov.w r11, #0
+; CHECK-NEXT: lsl.w r6, r12, #1
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: bic r2, r12, #3
-; CHECK-NEXT: subs r3, r2, #4
+; CHECK-NEXT: bic r0, r11, #3
+; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: subs r3, r0, #4
+; CHECK-NEXT: vdup.32 q1, r12
+; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: add.w r0, r7, r3, lsr #2
-; CHECK-NEXT: ldr r7, [sp, #136]
; CHECK-NEXT: adr r3, .LCPI10_0
-; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: lsl.w r0, r12, #1
-; CHECK-NEXT: vdup.32 q1, r7
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r0, r11, #1
; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
+; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: lsls r6, r7, #1
-; CHECK-NEXT: vshl.i32 q3, q1, #2
-; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB10_5
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
@@ -647,15 +644,15 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: b .LBB10_15
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add r7, r11
+; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r2, r0
+; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add r11, r12
-; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add r3, r0
-; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: adds r3, #1
-; CHECK-NEXT: cmp r3, r0
+; CHECK-NEXT: adds r2, #1
+; CHECK-NEXT: cmp r2, r0
; CHECK-NEXT: beq .LBB10_1
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -663,9 +660,9 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: @ Child Loop BB10_15 Depth 2
-; CHECK-NEXT: mul r5, r3, r7
-; CHECK-NEXT: cmp.w r12, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: mul r5, r2, r12
+; CHECK-NEXT: cmp.w r11, #0
+; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
@@ -676,7 +673,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r8, r5
; CHECK-NEXT: add.w r8, r8, #1
-; CHECK-NEXT: cmp r8, r7
+; CHECK-NEXT: cmp r8, r12
; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1]
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
@@ -684,7 +681,7 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT: cmp.w r12, #3
+; CHECK-NEXT: cmp.w r11, #3
; CHECK-NEXT: bhi .LBB10_10
; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
; CHECK-NEXT: movs r4, #0
@@ -692,9 +689,11 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: b .LBB10_13
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: lsl.w r0, r12, #2
; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vdup.32 q4, r0
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: vmlas.i32 q5, q2, r8
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
@@ -702,27 +701,26 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16*
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q6, q5, q3
+; CHECK-NEXT: vadd.i32 q6, q5, q4
; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
; CHECK-NEXT: vldrh.s32 q5, [r3], #8
; CHECK-NEXT: vmul.i32 q5, q7, q5
-; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vadd.i32 q3, q5, q3
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: vaddv.u32 r10, q4
-; CHECK-NEXT: cmp r2, r12
-; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: ldr r4, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: vaddv.u32 r10, q3
+; CHECK-NEXT: cmp r4, r11
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: mla r3, r7, r4, r8
-; CHECK-NEXT: add.w r0, r11, r4
-; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: sub.w lr, r12, r4
-; CHECK-NEXT: add.w r9, r7, r0, lsl #1
-; CHECK-NEXT: ldr r7, [sp, #136]
+; CHECK-NEXT: mla r3, r12, r4, r8
+; CHECK-NEXT: adds r0, r7, r4
+; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: sub.w lr, r11, r4
+; CHECK-NEXT: add.w r9, r2, r0, lsl #1
; CHECK-NEXT: add.w r3, r1, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
@@ -867,12 +865,11 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: add.w r8, r7, #10
; CHECK-NEXT: adr r7, .LCPI11_0
; CHECK-NEXT: ldr r1, [sp, #96]
-; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: vldrw.u32 q1, [r7]
+; CHECK-NEXT: vldrw.u32 q0, [r7]
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov.w r9, #6
+; CHECK-NEXT: movs r3, #6
; CHECK-NEXT: movs r6, #11
-; CHECK-NEXT: vshl.i32 q0, q0, #2
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: .LBB11_1: @ %for.body10.i
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -881,7 +878,7 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: movs r7, #0
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB11_2: @ %for.cond22.preheader.i
; CHECK-NEXT: @ Parent Loop BB11_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
@@ -895,7 +892,7 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ => This Loop Header: Depth=3
; CHECK-NEXT: @ Child Loop BB11_4 Depth 4
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
-; CHECK-NEXT: dls lr, r9
+; CHECK-NEXT: dls lr, r3
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: mov.w r11, #4
; CHECK-NEXT: .LBB11_4: @ %for.body78.us.i
@@ -906,11 +903,13 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Child Loop BB11_5 Depth 5
; CHECK-NEXT: mul r4, r11, r6
; CHECK-NEXT: vdup.32 q3, r5
+; CHECK-NEXT: lsl.w r9, r2, #2
; CHECK-NEXT: vdup.32 q2, r7
-; CHECK-NEXT: vadd.i32 q4, q1, r4
+; CHECK-NEXT: vadd.i32 q4, q0, r4
+; CHECK-NEXT: vdup.32 q1, r9
; CHECK-NEXT: vmla.i32 q3, q4, r2
; CHECK-NEXT: adds r4, #113
-; CHECK-NEXT: vadd.i32 q4, q1, r4
+; CHECK-NEXT: vadd.i32 q4, q0, r4
; CHECK-NEXT: mov r4, r8
; CHECK-NEXT: vmla.i32 q2, q4, r2
; CHECK-NEXT: .LBB11_5: @ %vector.body
@@ -920,8 +919,8 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
; CHECK-NEXT: vldrb.s32 q6, [r0, q2]
-; CHECK-NEXT: vadd.i32 q5, q2, q0
-; CHECK-NEXT: vadd.i32 q4, q3, q0
+; CHECK-NEXT: vadd.i32 q5, q2, q1
+; CHECK-NEXT: vadd.i32 q4, q3, q1
; CHECK-NEXT: subs r4, #4
; CHECK-NEXT: vadd.i32 q2, q6, r2
; CHECK-NEXT: vldrb.s32 q6, [r1, q3]
@@ -941,12 +940,13 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(i8* nocapture readonly
; CHECK-NEXT: bne .LBB11_3
; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup26.i
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=2
+; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r7, #1
-; CHECK-NEXT: cmp r7, r3
+; CHECK-NEXT: cmp r7, r5
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup20.i
; CHECK-NEXT: @ in Loop: Header=BB11_1 Depth=1
-; CHECK-NEXT: ldr r5, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT: ldr r7, [sp, #148]
; CHECK-NEXT: adds r5, #1
; CHECK-NEXT: cmp r5, r7
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
index 8f969b8ad4c61a1..64057470be7fc59 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -245,9 +245,9 @@ entry:
define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x ptr> %offs) {
; CHECK-LABEL: ptr_v4i16_dup:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vdup.32 q1, r0
; CHECK-NEXT: vstrh.32 q1, [r1, q0]
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
index 63c1d8a97587796..1c4255f8505504d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-selectcc.ll
@@ -210,19 +210,21 @@ define i32 @e() {
; CHECK-NEXT: adr r0, .LCPI14_0
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: movs r0, #4
+; CHECK-NEXT: mov.w r12, #4
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: .LBB14_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r12
; CHECK-NEXT: cmp r1, #8
; CHECK-NEXT: csetm r2, eq
-; CHECK-NEXT: subs.w r3, r1, #8
+; CHECK-NEXT: subs.w r0, r1, #8
+; CHECK-NEXT: mvn.w r3, r2
; CHECK-NEXT: vdup.32 q2, r2
-; CHECK-NEXT: csel r1, r1, r3, ne
-; CHECK-NEXT: vbic q1, q1, q2
+; CHECK-NEXT: vdup.32 q3, r3
; CHECK-NEXT: vand q2, q2, q0
+; CHECK-NEXT: vand q1, q1, q3
+; CHECK-NEXT: csel r1, r1, r0, ne
; CHECK-NEXT: vorr q1, q2, q1
; CHECK-NEXT: b .LBB14_1
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
index 217caeebe633560..4445f0e84dc8f56 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll
@@ -774,8 +774,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_02468101214_0ext(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_02468101214_0ext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vmullb.u16 q0, q0, q1
+; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -790,8 +791,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_0ext_02468101214(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_0ext_02468101214:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vmullb.u16 q0, q1, q0
+; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -840,9 +842,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_13579111315_0ext(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_13579111315_0ext:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vrev32.16 q0, q0
-; CHECK-NEXT: vmullb.u16 q0, q0, q1
+; CHECK-NEXT: vmovlt.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -857,9 +859,9 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @zext16_0ext_13579111315(<8 x i16> %src1, i16 %src2) {
; CHECK-LABEL: zext16_0ext_13579111315:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vrev32.16 q0, q0
-; CHECK-NEXT: vdup.32 q1, r0
-; CHECK-NEXT: vmullb.u16 q0, q1, q0
+; CHECK-NEXT: vmovlt.u16 q0, q0
+; CHECK-NEXT: uxth r0, r0
+; CHECK-NEXT: vmul.i32 q0, q0, r0
; CHECK-NEXT: bx lr
entry:
%shuf1 = shufflevector <8 x i16> %src1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll
index ee20747e23dcc07..e393acfa9fdd803 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-23634.ll
@@ -8,16 +8,17 @@ define void @f_fu(ptr %ret, ptr %aa, float %b) {
; CHECK-LABEL: f_fu:
; CHECK: ## %bb.0: ## %allocas
; CHECK-NEXT: vcvttss2si %xmm0, %eax
-; CHECK-NEXT: vpbroadcastd %eax, %zmm0
-; CHECK-NEXT: vcvttps2dq (%rsi), %zmm1
-; CHECK-NEXT: vpsrld $31, %zmm0, %zmm2
-; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm2
-; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2
-; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vmovdqa32 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1}
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vcvttps2dq (%rsi), %zmm0
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shrl $31, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: sarl %ecx
+; CHECK-NEXT: movw $-21846, %dx ## imm = 0xAAAA
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqa32 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0 {%k1}
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vpbroadcastd %ecx, %zmm1
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT: vmovups %zmm0, (%rdi)
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 6c266be808eaf0d..d323178532d03ab 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -195,24 +195,35 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: notq %rdi
+; SSE-NEXT: movq %rdi, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrr:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm2
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT: notq %rdi
+; XOP-NEXT: vmovq %rdi, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpand %xmm3, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: notq %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -220,8 +231,11 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm2
; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX2-NEXT: notq %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
@@ -229,15 +243,21 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %rdi, %xmm2
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT: notq %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %rdi, %xmm2
-; AVX512VL-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0
+; AVX512VL-NEXT: notq %rdi
+; AVX512VL-NEXT: vpbroadcastq %rdi, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $248, %xmm3, %xmm1, %xmm0
; AVX512VL-NEXT: retq
%1 = insertelement <2 x i64> undef, i64 %a2, i32 0
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -252,38 +272,77 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i6
define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrm:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; SSE-NEXT: notq %rax
+; SSE-NEXT: movq %rax, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pand %xmm1, %xmm3
+; SSE-NEXT: por %xmm3, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm2
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOP-NEXT: notq %rax
+; XOP-NEXT: vmovq %rax, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vpand %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpand %xmm3, %xmm1, %xmm1
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
-; AVX-LABEL: bitselect_v2i64_broadcast_rrm:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vandnps %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: bitselect_v2i64_broadcast_rrm:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: notq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitselect_v2i64_broadcast_rrm:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX2-NEXT: notq %rax
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
-; AVX512F-NEXT: vandps %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vandnps %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: movq (%rdi), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT: notq %rax
+; AVX512F-NEXT: vmovq %rax, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to2}, %xmm1, %xmm0
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %xmm2
+; AVX512VL-NEXT: notq %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $248, %xmm3, %xmm1, %xmm0
; AVX512VL-NEXT: retq
%a2 = load i64, ptr %p2
%1 = insertelement <2 x i64> undef, i64 %a2, i32 0
@@ -510,13 +569,15 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT: notq %rdi
+; SSE-NEXT: movq %rdi, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm5
+; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
@@ -524,7 +585,13 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; XOP-NEXT: vmovq %rdi, %xmm2
; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOP-NEXT: notq %rdi
+; XOP-NEXT: vmovq %rdi, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v4i64_broadcast_rrr:
@@ -532,8 +599,12 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; AVX1-NEXT: vmovq %rdi, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT: notq %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -541,8 +612,11 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm2
; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT: notq %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -550,15 +624,21 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %rdi, %xmm2
; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-NEXT: notq %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm2
-; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: notq %rdi
+; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm3
+; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $248, %ymm3, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%1 = insertelement <4 x i64> undef, i64 %a2, i32 0
%2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -573,42 +653,84 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, ptr %p2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq %rax, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE-NEXT: notq %rax
+; SSE-NEXT: movq %rax, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm5
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pandn %xmm2, %xmm4
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm3
+; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm5
+; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vbroadcastsd (%rdi), %ymm2
-; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm2
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; XOP-NEXT: notq %rax
+; XOP-NEXT: vmovq %rax, %xmm3
+; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
-; AVX-LABEL: bitselect_v4i64_broadcast_rrm:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm2
-; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: bitselect_v4i64_broadcast_rrm:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT: notq %rax
+; AVX1-NEXT: vmovq %rax, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitselect_v4i64_broadcast_rrm:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT: notq %rax
+; AVX2-NEXT: vmovq %rax, %xmm3
+; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vbroadcastsd (%rdi), %ymm2
-; AVX512F-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vandnps %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: movq (%rdi), %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
+; AVX512F-NEXT: notq %rax
+; AVX512F-NEXT: vmovq %rax, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %ymm3
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %ymm2
+; AVX512VL-NEXT: notq %rax
+; AVX512VL-NEXT: vpbroadcastq %rax, %ymm3
+; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $248, %ymm3, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%a2 = load i64, ptr %p2
%1 = insertelement <4 x i64> undef, i64 %a2, i32 0
@@ -871,21 +993,21 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT: notq %rdi
+; SSE-NEXT: movq %rdi, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm8, %xmm9
-; SSE-NEXT: pandn %xmm7, %xmm9
-; SSE-NEXT: por %xmm9, %xmm3
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pandn %xmm6, %xmm7
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm8
-; SSE-NEXT: por %xmm8, %xmm0
+; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm9
+; SSE-NEXT: por %xmm9, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
@@ -893,8 +1015,16 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; XOP-NEXT: vmovq %rdi, %xmm4
; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0
-; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1
+; XOP-NEXT: notq %rdi
+; XOP-NEXT: vmovq %rdi, %xmm5
+; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
+; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm5, %ymm3, %ymm3
+; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm5, %ymm2, %ymm2
+; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrr:
@@ -902,11 +1032,15 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; AVX1-NEXT: vmovq %rdi, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; AVX1-NEXT: notq %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -914,18 +1048,24 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm4
; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4
+; AVX2-NEXT: notq %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm5
+; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %rdi, %zmm2
-; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
+; AVX512-NEXT: notq %rdi
+; AVX512-NEXT: vpbroadcastq %rdi, %zmm3
+; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq $248, %zmm3, %zmm1, %zmm0
; AVX512-NEXT: retq
%1 = insertelement <8 x i64> undef, i64 %a2, i32 0
%2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -940,46 +1080,86 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, ptr %p2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{.*#+}} xmm8 = mem[0],zero
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
+; SSE-NEXT: notq %rax
+; SSE-NEXT: movq %rax, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm8, %xmm9
-; SSE-NEXT: pandn %xmm7, %xmm9
-; SSE-NEXT: por %xmm9, %xmm3
-; SSE-NEXT: movdqa %xmm8, %xmm7
-; SSE-NEXT: pandn %xmm6, %xmm7
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: movdqa %xmm8, %xmm6
-; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: por %xmm6, %xmm1
-; SSE-NEXT: pandn %xmm4, %xmm8
-; SSE-NEXT: por %xmm8, %xmm0
+; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: por %xmm7, %xmm3
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: por %xmm6, %xmm2
+; SSE-NEXT: pand %xmm9, %xmm5
+; SSE-NEXT: por %xmm5, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm9
+; SSE-NEXT: por %xmm9, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
; XOP: # %bb.0:
-; XOP-NEXT: vbroadcastsd (%rdi), %ymm4
-; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0
-; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm4
+; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; XOP-NEXT: notq %rax
+; XOP-NEXT: vmovq %rax, %xmm5
+; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
+; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0
+; XOP-NEXT: vandps %ymm5, %ymm3, %ymm3
+; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1
+; XOP-NEXT: vandps %ymm5, %ymm2, %ymm2
+; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT: retq
;
-; AVX-LABEL: bitselect_v8i64_broadcast_rrm:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm4
-; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3
-; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2
-; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: bitselect_v8i64_broadcast_rrm:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; AVX1-NEXT: notq %rax
+; AVX1-NEXT: vmovq %rax, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
+; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3
+; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitselect_v8i64_broadcast_rrm:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: vmovq %rax, %xmm4
+; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4
+; AVX2-NEXT: notq %rax
+; AVX2-NEXT: vmovq %rax, %xmm5
+; AVX2-NEXT: vpbroadcastq %xmm5, %ymm5
+; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: vpbroadcastq %rax, %zmm2
+; AVX512-NEXT: notq %rax
+; AVX512-NEXT: vpbroadcastq %rax, %zmm3
+; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq $248, %zmm3, %zmm1, %zmm0
; AVX512-NEXT: retq
%a2 = load i64, ptr %p2
%1 = insertelement <8 x i64> undef, i64 %a2, i32 0
diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
index db52dd6b47b46cb..4d5f55b3312673b 100644
--- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll
@@ -67,9 +67,9 @@ declare <4 x i32> @llvm.vp.mul.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_sdiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_sdiv_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@@ -210,9 +210,9 @@ declare <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_udiv_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_udiv_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@@ -353,9 +353,9 @@ declare <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_srem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_srem_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
@@ -496,9 +496,9 @@ declare <4 x i32> @llvm.vp.srem.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
define void @vp_urem_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %out, i32 %vp) nounwind {
; SSE-LABEL: vp_urem_v4i32:
; SSE: # %bb.0:
+; SSE-NEXT: addl $-2147483648, %esi # imm = 0x80000000
; SSE-NEXT: movd %esi, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: paddd %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index e500801b69c4d00..462327b324c3de0 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -232,13 +232,27 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
; X86-NEXT: popl %eax
; X86-NEXT: retl
;
-; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
-; X64: # %bb.0:
-; X64-NEXT: sarq $30, %rdi
-; X64-NEXT: vmovq %rdi, %xmm0
-; X64-NEXT: vpsrlq $3, %xmm0, %xmm0
-; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-AVX1-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovq %rdi, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-AVX1-NEXT: vpsrad $30, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpsrlq $30, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-AVX1-NEXT: vpsrlq $3, %xmm0, %xmm0
+; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovq %rdi, %xmm0
+; X64-AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrad $30, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpsrlq $30, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; X64-AVX2-NEXT: vpsrlq $3, %xmm0, %xmm0
+; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
%1 = ashr i64 %a0, 30
%2 = insertelement <2 x i64> undef, i64 %1, i32 0
%3 = insertelement <2 x i64> %2, i64 %a1, i32 1
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 1289eef7795dccb..b4595fb42ef61c6 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -816,11 +816,13 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
;
; KNL_32-LABEL: test14:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vmovd %xmm0, %eax
-; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vmovd %xmm0, %ecx
+; KNL_32-NEXT: shll $2, %eax
+; KNL_32-NEXT: vpbroadcastd %eax, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: vgatherdps (%ecx,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test14:
@@ -836,11 +838,13 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
;
; SKX_32-LABEL: test14:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vmovd %xmm0, %eax
-; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vmovd %xmm0, %ecx
+; SKX_32-NEXT: shll $2, %eax
+; SKX_32-NEXT: vpbroadcastd %eax, %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%ecx,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT: retl
%broadcast.splatinsert = insertelement <16 x ptr> %vec, ptr %base, i32 1
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 1e31ee7ad6b590b..6f2732095ac9cd5 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -1074,6 +1074,7 @@ define void @merge_4i32_i32_combine(ptr %dst, ptr %src) {
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE1-NEXT: andps %xmm0, %xmm1
; X86-SSE1-NEXT: movaps %xmm1, (%eax)
diff --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll
index ea85c312f38bd1c..960d24c0c23c34c 100644
--- a/llvm/test/CodeGen/X86/pr50609.ll
+++ b/llvm/test/CodeGen/X86/pr50609.ll
@@ -4,17 +4,7 @@
define void @PR50609(ptr noalias nocapture %RET, ptr noalias %aFOO, <16 x i32> %__mask) nounwind {
; CHECK-LABEL: PR50609:
; CHECK: # %bb.0: # %allocas
-; CHECK-NEXT: leal 40(%rsi), %eax
-; CHECK-NEXT: vmovq %rsi, %xmm2
-; CHECK-NEXT: vmovd %eax, %xmm3
-; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; CHECK-NEXT: vpsrad $31, %xmm2, %xmm3
-; CHECK-NEXT: vpsrld $30, %xmm3, %xmm3
-; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; CHECK-NEXT: vpsrad $2, %xmm2, %xmm2
-; CHECK-NEXT: vcvtdq2ps %ymm2, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1]
; CHECK-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi)
; CHECK-NEXT: vmaskmovps %ymm2, %ymm1, 32(%rdi)
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index 565946d342e9359..a6119ef3189e34d 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -171,23 +171,25 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-LABEL: test_srem_vec:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdx, %rcx
-; SSE2-NEXT: shlq $31, %rcx
-; SSE2-NEXT: sarq $31, %rcx
-; SSE2-NEXT: shlq $31, %rdi
-; SSE2-NEXT: sarq $31, %rdi
+; SSE2-NEXT: movq %rsi, %r8
+; SSE2-NEXT: movq %rdx, %rsi
; SSE2-NEXT: shlq $31, %rsi
; SSE2-NEXT: sarq $31, %rsi
-; SSE2-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72
-; SSE2-NEXT: movq %rsi, %rax
-; SSE2-NEXT: imulq %r8
+; SSE2-NEXT: shlq $31, %rdi
+; SSE2-NEXT: sarq $31, %rdi
+; SSE2-NEXT: shlq $31, %r8
+; SSE2-NEXT: sarq $31, %r8
+; SSE2-NEXT: movabsq $2049638230412172402, %r9 # imm = 0x1C71C71C71C71C72
+; SSE2-NEXT: movq %r8, %rax
+; SSE2-NEXT: imulq %r9
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
-; SSE2-NEXT: subq %rax, %rsi
-; SSE2-NEXT: movq %rsi, %xmm1
+; SSE2-NEXT: subq %rax, %r8
+; SSE2-NEXT: movq %r8, %xmm1
; SSE2-NEXT: movq %rdi, %rax
-; SSE2-NEXT: imulq %r8
+; SSE2-NEXT: imulq %r9
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: addq %rdx, %rax
@@ -195,26 +197,27 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: subq %rax, %rdi
; SSE2-NEXT: movq %rdi, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71
-; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %rsi, %rax
; SSE2-NEXT: imulq %rdx
-; SSE2-NEXT: subq %rcx, %rdx
+; SSE2-NEXT: subq %rsi, %rdx
; SSE2-NEXT: movq %rdx, %rax
; SSE2-NEXT: shrq $63, %rax
; SSE2-NEXT: sarq $3, %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax
; SSE2-NEXT: addq %rcx, %rax
-; SSE2-NEXT: movq %rax, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: movabsq $8589934591, %rcx # imm = 0x1FFFFFFFF
+; SSE2-NEXT: andq %rax, %rcx
+; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
+; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
@@ -226,23 +229,25 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE41-LABEL: test_srem_vec:
; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdx, %rcx
-; SSE41-NEXT: shlq $31, %rcx
-; SSE41-NEXT: sarq $31, %rcx
-; SSE41-NEXT: shlq $31, %rdi
-; SSE41-NEXT: sarq $31, %rdi
+; SSE41-NEXT: movq %rsi, %r8
+; SSE41-NEXT: movq %rdx, %rsi
; SSE41-NEXT: shlq $31, %rsi
; SSE41-NEXT: sarq $31, %rsi
-; SSE41-NEXT: movabsq $2049638230412172402, %r8 # imm = 0x1C71C71C71C71C72
-; SSE41-NEXT: movq %rsi, %rax
-; SSE41-NEXT: imulq %r8
+; SSE41-NEXT: shlq $31, %rdi
+; SSE41-NEXT: sarq $31, %rdi
+; SSE41-NEXT: shlq $31, %r8
+; SSE41-NEXT: sarq $31, %r8
+; SSE41-NEXT: movabsq $2049638230412172402, %r9 # imm = 0x1C71C71C71C71C72
+; SSE41-NEXT: movq %r8, %rax
+; SSE41-NEXT: imulq %r9
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: addq %rdx, %rax
; SSE41-NEXT: leaq (%rax,%rax,8), %rax
-; SSE41-NEXT: subq %rax, %rsi
-; SSE41-NEXT: movq %rsi, %xmm1
+; SSE41-NEXT: subq %rax, %r8
+; SSE41-NEXT: movq %r8, %xmm1
; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: imulq %r8
+; SSE41-NEXT: imulq %r9
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: addq %rdx, %rax
@@ -250,28 +255,29 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE41-NEXT: subq %rax, %rdi
; SSE41-NEXT: movq %rdi, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591]
-; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71
-; SSE41-NEXT: movq %rcx, %rax
+; SSE41-NEXT: movq %rsi, %rax
; SSE41-NEXT: imulq %rdx
-; SSE41-NEXT: subq %rcx, %rdx
+; SSE41-NEXT: subq %rsi, %rdx
; SSE41-NEXT: movq %rdx, %rax
; SSE41-NEXT: shrq $63, %rax
; SSE41-NEXT: sarq $3, %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (%rdx,%rdx,8), %rax
; SSE41-NEXT: addq %rcx, %rax
-; SSE41-NEXT: movq %rax, %xmm2
-; SSE41-NEXT: pand %xmm1, %xmm2
+; SSE41-NEXT: movabsq $8589934591, %rcx # imm = 0x1FFFFFFFF
+; SSE41-NEXT: andq %rax, %rcx
+; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: pextrb $8, %xmm0, %edx
-; SSE41-NEXT: pextrb $0, %xmm2, %ecx
+; SSE41-NEXT: pextrb $0, %xmm1, %ecx
; SSE41-NEXT: # kill: def $al killed $al killed $eax
; SSE41-NEXT: # kill: def $dl killed $dl killed $edx
; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 809d94b649fb48a..94aabbe730278e6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -915,11 +915,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) {
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vmovd %edi, %xmm0
-; AVX512F-NEXT: movl $789, %eax # imm = 0x315
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
+; AVX512F-NEXT: imull $789, %edi, %eax # imm = 0x315
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -931,11 +928,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) {
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vmovd %edi, %xmm0
-; AVX512VL-NEXT: movl $789, %eax # imm = 0x315
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
+; AVX512VL-NEXT: imull $789, %edi, %eax # imm = 0x315
+; AVX512VL-NEXT: vpbroadcastd %eax, %zmm0
; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1}
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -946,11 +940,8 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) {
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllw $7, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovb2m %xmm0, %k1
-; VL_BW_DQ-NEXT: vmovd %edi, %xmm0
-; VL_BW_DQ-NEXT: movl $789, %eax # imm = 0x315
-; VL_BW_DQ-NEXT: vmovd %eax, %xmm1
-; VL_BW_DQ-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
+; VL_BW_DQ-NEXT: imull $789, %edi, %eax # imm = 0x315
+; VL_BW_DQ-NEXT: vpbroadcastd %eax, %zmm0
; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1}
; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT: vzeroupper