[llvm] 742fb8b - [DAGCombine] Fold (store (insert_elt (load p)) x p) -> (store x)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 28 14:45:11 PDT 2023
Author: Luke Lau
Date: 2023-06-28T22:45:04+01:00
New Revision: 742fb8b5c7036409f08ab0706f00057ac29ac773
URL: https://github.com/llvm/llvm-project/commit/742fb8b5c7036409f08ab0706f00057ac29ac773
DIFF: https://github.com/llvm/llvm-project/commit/742fb8b5c7036409f08ab0706f00057ac29ac773.diff
LOG: [DAGCombine] Fold (store (insert_elt (load p)) x p) -> (store x)
If we have a store of a load from the same address with nothing else in
between on the chain, the store is considered dead and is removed. So when
legalizing a fixed-length vector store of an insert, we can sometimes end up
producing better code through scalarization than without it.
An example is shown below:
%a = load <4 x i64>, ptr %x
%b = insertelement <4 x i64> %a, i64 %y, i32 2
store <4 x i64> %b, ptr %x
If this is scalarized, then DAGCombine successfully removes 3 of the 4
resulting scalar stores as dead, and on RISC-V we get:
sd a1, 16(a0)
However, if we make the vector type legal (-mattr=+v), we lose the
optimisation because the store is no longer scalarized.
This patch attempts to recover the optimisation for vectors by identifying
the pattern of a store of a load with a single insert in between, and
replacing it with a scalar store of the inserted element.
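For reference, a self-contained reproduction of the example above might look
like the following (the function name is illustrative and not taken from the
patch's tests):

define void @insert_store(ptr %x, i64 %y) {
  %a = load <4 x i64>, ptr %x
  %b = insertelement <4 x i64> %a, i64 %y, i32 2
  store <4 x i64> %b, ptr %x
  ret void
}

Run through llc with -mtriple=riscv64 -mattr=+v, this should now lower to the
single scalar store sd a1, 16(a0) (element index 2 times 8 bytes gives offset
16) rather than a vector load, slide-insert and vector store sequence.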
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D152276
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
llvm/test/CodeGen/ARM/fp16-promote.ll
llvm/test/CodeGen/ARM/vector-DAGCombine.ll
llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll
llvm/test/CodeGen/Mips/msa/basic_operations.ll
llvm/test/CodeGen/Mips/msa/basic_operations_float.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
llvm/test/CodeGen/X86/pr47874.ll
llvm/test/CodeGen/X86/pr59980.ll
llvm/test/CodeGen/X86/vec_insert-mmx.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b2853b90c8db2..8c0b3831694e8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -516,6 +516,7 @@ namespace {
SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
+ SDValue replaceStoreOfInsertLoad(StoreSDNode *ST);
bool refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(SDNode *N);
@@ -20409,6 +20410,62 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
}
}
+// (store (insert_vector_elt (load p), x, i), p) -> (store x, p+offset)
+//
+// If a store of a load with an element inserted into it has no other
+// uses in between the chain, then we can consider the vector store
+// dead and replace it with just the single scalar element store.
+SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
+ SDLoc DL(ST);
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+ SDValue Chain = ST->getChain();
+ if (Value.getOpcode() != ISD::INSERT_VECTOR_ELT || !Value.hasOneUse())
+ return SDValue();
+
+ SDValue Elt = Value.getOperand(1);
+ SDValue Idx = Value.getOperand(2);
+
+ // If the element isn't byte sized then we can't compute an offset
+ EVT EltVT = Elt.getValueType();
+ if (!EltVT.isByteSized())
+ return SDValue();
+
+ auto *Ld = dyn_cast<LoadSDNode>(Value.getOperand(0));
+ if (!Ld || Ld->getBasePtr() != Ptr ||
+ ST->getMemoryVT() != Ld->getMemoryVT() || !ST->isSimple() ||
+ !ISD::isNormalStore(ST) ||
+ Ld->getAddressSpace() != ST->getAddressSpace() ||
+ !Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1)))
+ return SDValue();
+
+ unsigned IsFast;
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ Elt.getValueType(), ST->getAddressSpace(),
+ ST->getAlign(), ST->getMemOperand()->getFlags(),
+ &IsFast) ||
+ !IsFast)
+ return SDValue();
+ EVT PtrVT = Ptr.getValueType();
+
+ SDValue Offset =
+ DAG.getNode(ISD::MUL, DL, PtrVT, Idx,
+ DAG.getConstant(EltVT.getSizeInBits() / 8, DL, PtrVT));
+ SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Offset);
+ MachinePointerInfo PointerInfo(ST->getAddressSpace());
+
+ // If the offset is a known constant then try to recover the pointer
+ // info
+ if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ unsigned COffset = CIdx->getSExtValue() * EltVT.getSizeInBits() / 8;
+ NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(COffset), DL);
+ PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
+ }
+
+ return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
+ ST->getMemOperand()->getFlags());
+}
+
SDValue DAGCombiner::visitSTORE(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Chain = ST->getChain();
@@ -20548,6 +20605,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
}
+ // Try scalarizing vector stores of loads where we only change one element
+ if (SDValue NewST = replaceStoreOfInsertLoad(ST))
+ return NewST;
+
// TODO: Can relax for unordered atomics (see D66309)
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
if (ST->isUnindexed() && ST->isSimple() &&
diff --git a/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
index 98486a190cec0..a4e6a942d0fec 100644
--- a/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
+++ b/llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
@@ -13,9 +13,7 @@ define void @test(i1 %c, ptr %ptr) {
; CHECK-NEXT: ; %bb.1: ; %bb1
; CHECK-NEXT: ldr d0, [x1]
; CHECK-NEXT: LBB0_2: ; %bb2
-; CHECK-NEXT: ldr q1, [x8]
-; CHECK-NEXT: mov.d v1[0], v0[0]
-; CHECK-NEXT: str q1, [x8]
+; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: ret
entry:
br i1 %c, label %bb1, label %bb2
diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index ab8282c5aa120..69820850893ef 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -865,8 +865,14 @@ define void @test_fmuladd(ptr %p, ptr %q, ptr %r) #0 {
; CHECK-VFP: ldrh
; CHECK-VFP: stm
; CHECK-VFP: strh
-; CHECK-VFP: ldm
-; CHECK-VFP: stm
+; CHECK-VFP: ldrh
+; CHECK-VFP: ldrh
+; CHECK-VFP: ldrh
+; CHECK-VFP: ldrh
+; CHECK-VFP: strh
+; CHECK-VFP: strh
+; CHECK-VFP: strh
+; CHECK-VFP: strh
; CHECK-NOVFP: ldrh
; CHECK-NOVFP: ldrh
@@ -893,7 +899,7 @@ define void @test_insertelement(ptr %p, ptr %q, i32 %i) #0 {
%a = load half, ptr %p, align 2
%b = load <4 x half>, ptr %q, align 8
%c = insertelement <4 x half> %b, half %a, i32 %i
- store <4 x half> %c, ptr %q
+ store volatile <4 x half> %c, ptr %q
ret void
}
diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
index 2ba3423b834f2..fb21d1682bb38 100644
--- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -134,9 +134,8 @@ define void @i64_buildvector(ptr %ptr, ptr %vp) nounwind {
define void @i64_insertelement(ptr %ptr, ptr %vp) nounwind {
; CHECK-LABEL: i64_insertelement:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: ldm r0, {r2, r3}
+; CHECK-NEXT: strd r2, r3, [r1]
; CHECK-NEXT: bx lr
%t0 = load i64, ptr %ptr, align 4
%vec = load <2 x i64>, ptr %vp
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll b/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll
index 5e6365a33ec26..78092e813587b 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll
@@ -9,7 +9,7 @@ define ptr @fred(ptr %v0) local_unnamed_addr #0 {
b0:
%v1 = load <64 x half>, ptr %v0, align 2
%v2 = insertelement <64 x half> %v1, half 0xH4170, i32 17
- store <64 x half> %v2, ptr %v0, align 2
+ store volatile <64 x half> %v2, ptr %v0, align 2
ret ptr %v0
}
diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
index 397fa7d69eb6a..fbae1bda7c665 100644
--- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
@@ -1740,10 +1740,8 @@ define void @insert_v16i8(i32 signext %a) nounwind {
; O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; O32-NEXT: addu $1, $2, $25
; O32-NEXT: lw $1, %got(v16i8)($1)
-; O32-NEXT: ld.b $w0, 0($1)
-; O32-NEXT: insert.b $w0[1], $4
; O32-NEXT: jr $ra
-; O32-NEXT: st.b $w0, 0($1)
+; O32-NEXT: sb $4, 1($1)
;
; N32-LABEL: insert_v16i8:
; N32: # %bb.0:
@@ -1751,10 +1749,8 @@ define void @insert_v16i8(i32 signext %a) nounwind {
; N32-NEXT: addu $1, $1, $25
; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8)))
; N32-NEXT: lw $1, %got_disp(v16i8)($1)
-; N32-NEXT: ld.b $w0, 0($1)
-; N32-NEXT: insert.b $w0[1], $4
; N32-NEXT: jr $ra
-; N32-NEXT: st.b $w0, 0($1)
+; N32-NEXT: sb $4, 1($1)
;
; N64-LABEL: insert_v16i8:
; N64: # %bb.0:
@@ -1762,10 +1758,8 @@ define void @insert_v16i8(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8)))
; N64-NEXT: ld $1, %got_disp(v16i8)($1)
-; N64-NEXT: ld.b $w0, 0($1)
-; N64-NEXT: insert.b $w0[1], $4
; N64-NEXT: jr $ra
-; N64-NEXT: st.b $w0, 0($1)
+; N64-NEXT: sb $4, 1($1)
%1 = load <16 x i8>, ptr @v16i8
%a2 = trunc i32 %a to i8
%a3 = sext i8 %a2 to i32
@@ -1782,10 +1776,8 @@ define void @insert_v8i16(i32 signext %a) nounwind {
; O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; O32-NEXT: addu $1, $2, $25
; O32-NEXT: lw $1, %got(v8i16)($1)
-; O32-NEXT: ld.h $w0, 0($1)
-; O32-NEXT: insert.h $w0[1], $4
; O32-NEXT: jr $ra
-; O32-NEXT: st.h $w0, 0($1)
+; O32-NEXT: sh $4, 2($1)
;
; N32-LABEL: insert_v8i16:
; N32: # %bb.0:
@@ -1793,10 +1785,8 @@ define void @insert_v8i16(i32 signext %a) nounwind {
; N32-NEXT: addu $1, $1, $25
; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16)))
; N32-NEXT: lw $1, %got_disp(v8i16)($1)
-; N32-NEXT: ld.h $w0, 0($1)
-; N32-NEXT: insert.h $w0[1], $4
; N32-NEXT: jr $ra
-; N32-NEXT: st.h $w0, 0($1)
+; N32-NEXT: sh $4, 2($1)
;
; N64-LABEL: insert_v8i16:
; N64: # %bb.0:
@@ -1804,10 +1794,8 @@ define void @insert_v8i16(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16)))
; N64-NEXT: ld $1, %got_disp(v8i16)($1)
-; N64-NEXT: ld.h $w0, 0($1)
-; N64-NEXT: insert.h $w0[1], $4
; N64-NEXT: jr $ra
-; N64-NEXT: st.h $w0, 0($1)
+; N64-NEXT: sh $4, 2($1)
%1 = load <8 x i16>, ptr @v8i16
%a2 = trunc i32 %a to i16
%a3 = sext i16 %a2 to i32
@@ -1824,10 +1812,8 @@ define void @insert_v4i32(i32 signext %a) nounwind {
; O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; O32-NEXT: addu $1, $2, $25
; O32-NEXT: lw $1, %got(v4i32)($1)
-; O32-NEXT: ld.w $w0, 0($1)
-; O32-NEXT: insert.w $w0[1], $4
; O32-NEXT: jr $ra
-; O32-NEXT: st.w $w0, 0($1)
+; O32-NEXT: sw $4, 4($1)
;
; N32-LABEL: insert_v4i32:
; N32: # %bb.0:
@@ -1835,10 +1821,8 @@ define void @insert_v4i32(i32 signext %a) nounwind {
; N32-NEXT: addu $1, $1, $25
; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32)))
; N32-NEXT: lw $1, %got_disp(v4i32)($1)
-; N32-NEXT: ld.w $w0, 0($1)
-; N32-NEXT: insert.w $w0[1], $4
; N32-NEXT: jr $ra
-; N32-NEXT: st.w $w0, 0($1)
+; N32-NEXT: sw $4, 4($1)
;
; N64-LABEL: insert_v4i32:
; N64: # %bb.0:
@@ -1846,10 +1830,8 @@ define void @insert_v4i32(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32)))
; N64-NEXT: ld $1, %got_disp(v4i32)($1)
-; N64-NEXT: ld.w $w0, 0($1)
-; N64-NEXT: insert.w $w0[1], $4
; N64-NEXT: jr $ra
-; N64-NEXT: st.w $w0, 0($1)
+; N64-NEXT: sw $4, 4($1)
%1 = load <4 x i32>, ptr @v4i32
%2 = insertelement <4 x i32> %1, i32 %a, i32 1
store <4 x i32> %2, ptr @v4i32
@@ -1862,11 +1844,9 @@ define void @insert_v2i64(i64 signext %a) nounwind {
; O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; O32-NEXT: addu $1, $2, $25
; O32-NEXT: lw $1, %got(v2i64)($1)
-; O32-NEXT: ld.w $w0, 0($1)
-; O32-NEXT: insert.w $w0[2], $4
-; O32-NEXT: insert.w $w0[3], $5
+; O32-NEXT: sw $5, 12($1)
; O32-NEXT: jr $ra
-; O32-NEXT: st.w $w0, 0($1)
+; O32-NEXT: sw $4, 8($1)
;
; N32-LABEL: insert_v2i64:
; N32: # %bb.0:
@@ -1874,10 +1854,8 @@ define void @insert_v2i64(i64 signext %a) nounwind {
; N32-NEXT: addu $1, $1, $25
; N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64)))
; N32-NEXT: lw $1, %got_disp(v2i64)($1)
-; N32-NEXT: ld.d $w0, 0($1)
-; N32-NEXT: insert.d $w0[1], $4
; N32-NEXT: jr $ra
-; N32-NEXT: st.d $w0, 0($1)
+; N32-NEXT: sd $4, 8($1)
;
; N64-LABEL: insert_v2i64:
; N64: # %bb.0:
@@ -1885,10 +1863,8 @@ define void @insert_v2i64(i64 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64)))
; N64-NEXT: ld $1, %got_disp(v2i64)($1)
-; N64-NEXT: ld.d $w0, 0($1)
-; N64-NEXT: insert.d $w0[1], $4
; N64-NEXT: jr $ra
-; N64-NEXT: st.d $w0, 0($1)
+; N64-NEXT: sd $4, 8($1)
%1 = load <2 x i64>, ptr @v2i64
%2 = insertelement <2 x i64> %1, i64 %a, i32 1
store <2 x i64> %2, ptr @v2i64
@@ -1904,13 +1880,9 @@ define void @insert_v16i8_vidx(i32 signext %a) nounwind {
; O32-NEXT: lw $2, %got(i32)($1)
; O32-NEXT: lw $2, 0($2)
; O32-NEXT: lw $1, %got(v16i8)($1)
-; O32-NEXT: ld.b $w0, 0($1)
-; O32-NEXT: sld.b $w0, $w0[$2]
-; O32-NEXT: insert.b $w0[0], $4
-; O32-NEXT: neg $2, $2
-; O32-NEXT: sld.b $w0, $w0[$2]
+; O32-NEXT: addu $1, $1, $2
; O32-NEXT: jr $ra
-; O32-NEXT: st.b $w0, 0($1)
+; O32-NEXT: sb $4, 0($1)
;
; N32-LABEL: insert_v16i8_vidx:
; N32: # %bb.0:
@@ -1920,13 +1892,9 @@ define void @insert_v16i8_vidx(i32 signext %a) nounwind {
; N32-NEXT: lw $2, %got_disp(i32)($1)
; N32-NEXT: lw $2, 0($2)
; N32-NEXT: lw $1, %got_disp(v16i8)($1)
-; N32-NEXT: ld.b $w0, 0($1)
-; N32-NEXT: sld.b $w0, $w0[$2]
-; N32-NEXT: insert.b $w0[0], $4
-; N32-NEXT: neg $2, $2
-; N32-NEXT: sld.b $w0, $w0[$2]
+; N32-NEXT: addu $1, $1, $2
; N32-NEXT: jr $ra
-; N32-NEXT: st.b $w0, 0($1)
+; N32-NEXT: sb $4, 0($1)
;
; N64-LABEL: insert_v16i8_vidx:
; N64: # %bb.0:
@@ -1936,13 +1904,9 @@ define void @insert_v16i8_vidx(i32 signext %a) nounwind {
; N64-NEXT: ld $2, %got_disp(i32)($1)
; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: ld $1, %got_disp(v16i8)($1)
-; N64-NEXT: ld.b $w0, 0($1)
-; N64-NEXT: sld.b $w0, $w0[$2]
-; N64-NEXT: insert.b $w0[0], $4
-; N64-NEXT: dneg $2, $2
-; N64-NEXT: sld.b $w0, $w0[$2]
+; N64-NEXT: daddu $1, $1, $2
; N64-NEXT: jr $ra
-; N64-NEXT: st.b $w0, 0($1)
+; N64-NEXT: sb $4, 0($1)
%1 = load <16 x i8>, ptr @v16i8
%2 = load i32, ptr @i32
%a2 = trunc i32 %a to i8
@@ -1962,14 +1926,9 @@ define void @insert_v8i16_vidx(i32 signext %a) nounwind {
; O32-NEXT: lw $2, %got(i32)($1)
; O32-NEXT: lw $2, 0($2)
; O32-NEXT: lw $1, %got(v8i16)($1)
-; O32-NEXT: ld.h $w0, 0($1)
-; O32-NEXT: sll $2, $2, 1
-; O32-NEXT: sld.b $w0, $w0[$2]
-; O32-NEXT: insert.h $w0[0], $4
-; O32-NEXT: neg $2, $2
-; O32-NEXT: sld.b $w0, $w0[$2]
+; O32-NEXT: lsa $1, $2, $1, 1
; O32-NEXT: jr $ra
-; O32-NEXT: st.h $w0, 0($1)
+; O32-NEXT: sh $4, 0($1)
;
; N32-LABEL: insert_v8i16_vidx:
; N32: # %bb.0:
@@ -1979,14 +1938,9 @@ define void @insert_v8i16_vidx(i32 signext %a) nounwind {
; N32-NEXT: lw $2, %got_disp(i32)($1)
; N32-NEXT: lw $2, 0($2)
; N32-NEXT: lw $1, %got_disp(v8i16)($1)
-; N32-NEXT: ld.h $w0, 0($1)
-; N32-NEXT: sll $2, $2, 1
-; N32-NEXT: sld.b $w0, $w0[$2]
-; N32-NEXT: insert.h $w0[0], $4
-; N32-NEXT: neg $2, $2
-; N32-NEXT: sld.b $w0, $w0[$2]
+; N32-NEXT: lsa $1, $2, $1, 1
; N32-NEXT: jr $ra
-; N32-NEXT: st.h $w0, 0($1)
+; N32-NEXT: sh $4, 0($1)
;
; N64-LABEL: insert_v8i16_vidx:
; N64: # %bb.0:
@@ -1996,14 +1950,9 @@ define void @insert_v8i16_vidx(i32 signext %a) nounwind {
; N64-NEXT: ld $2, %got_disp(i32)($1)
; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: ld $1, %got_disp(v8i16)($1)
-; N64-NEXT: ld.h $w0, 0($1)
-; N64-NEXT: dsll $2, $2, 1
-; N64-NEXT: sld.b $w0, $w0[$2]
-; N64-NEXT: insert.h $w0[0], $4
-; N64-NEXT: dneg $2, $2
-; N64-NEXT: sld.b $w0, $w0[$2]
+; N64-NEXT: dlsa $1, $2, $1, 1
; N64-NEXT: jr $ra
-; N64-NEXT: st.h $w0, 0($1)
+; N64-NEXT: sh $4, 0($1)
%1 = load <8 x i16>, ptr @v8i16
%2 = load i32, ptr @i32
%a2 = trunc i32 %a to i16
@@ -2023,14 +1972,9 @@ define void @insert_v4i32_vidx(i32 signext %a) nounwind {
; O32-NEXT: lw $2, %got(i32)($1)
; O32-NEXT: lw $2, 0($2)
; O32-NEXT: lw $1, %got(v4i32)($1)
-; O32-NEXT: ld.w $w0, 0($1)
-; O32-NEXT: sll $2, $2, 2
-; O32-NEXT: sld.b $w0, $w0[$2]
-; O32-NEXT: insert.w $w0[0], $4
-; O32-NEXT: neg $2, $2
-; O32-NEXT: sld.b $w0, $w0[$2]
+; O32-NEXT: lsa $1, $2, $1, 2
; O32-NEXT: jr $ra
-; O32-NEXT: st.w $w0, 0($1)
+; O32-NEXT: sw $4, 0($1)
;
; N32-LABEL: insert_v4i32_vidx:
; N32: # %bb.0:
@@ -2040,14 +1984,9 @@ define void @insert_v4i32_vidx(i32 signext %a) nounwind {
; N32-NEXT: lw $2, %got_disp(i32)($1)
; N32-NEXT: lw $2, 0($2)
; N32-NEXT: lw $1, %got_disp(v4i32)($1)
-; N32-NEXT: ld.w $w0, 0($1)
-; N32-NEXT: sll $2, $2, 2
-; N32-NEXT: sld.b $w0, $w0[$2]
-; N32-NEXT: insert.w $w0[0], $4
-; N32-NEXT: neg $2, $2
-; N32-NEXT: sld.b $w0, $w0[$2]
+; N32-NEXT: lsa $1, $2, $1, 2
; N32-NEXT: jr $ra
-; N32-NEXT: st.w $w0, 0($1)
+; N32-NEXT: sw $4, 0($1)
;
; N64-LABEL: insert_v4i32_vidx:
; N64: # %bb.0:
@@ -2057,14 +1996,9 @@ define void @insert_v4i32_vidx(i32 signext %a) nounwind {
; N64-NEXT: ld $2, %got_disp(i32)($1)
; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: ld $1, %got_disp(v4i32)($1)
-; N64-NEXT: ld.w $w0, 0($1)
-; N64-NEXT: dsll $2, $2, 2
-; N64-NEXT: sld.b $w0, $w0[$2]
-; N64-NEXT: insert.w $w0[0], $4
-; N64-NEXT: dneg $2, $2
-; N64-NEXT: sld.b $w0, $w0[$2]
+; N64-NEXT: dlsa $1, $2, $1, 2
; N64-NEXT: jr $ra
-; N64-NEXT: st.w $w0, 0($1)
+; N64-NEXT: sw $4, 0($1)
%1 = load <4 x i32>, ptr @v4i32
%2 = load i32, ptr @i32
%3 = insertelement <4 x i32> %1, i32 %a, i32 %2
@@ -2084,22 +2018,11 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind {
; O32-NEXT: addu $1, $2, $25
; O32-NEXT: lw $2, %got(i32)($1)
; O32-NEXT: lw $2, 0($2)
-; O32-NEXT: addu $2, $2, $2
; O32-NEXT: lw $1, %got(v2i64)($1)
-; O32-NEXT: ld.w $w0, 0($1)
-; O32-NEXT: sll $3, $2, 2
-; O32-NEXT: sld.b $w0, $w0[$3]
-; O32-NEXT: insert.w $w0[0], $4
-; O32-NEXT: neg $3, $3
-; O32-NEXT: sld.b $w0, $w0[$3]
-; O32-NEXT: addiu $2, $2, 1
-; O32-NEXT: sll $2, $2, 2
-; O32-NEXT: sld.b $w0, $w0[$2]
-; O32-NEXT: insert.w $w0[0], $5
-; O32-NEXT: neg $2, $2
-; O32-NEXT: sld.b $w0, $w0[$2]
+; O32-NEXT: lsa $1, $2, $1, 3
+; O32-NEXT: sw $5, 4($1)
; O32-NEXT: jr $ra
-; O32-NEXT: st.w $w0, 0($1)
+; O32-NEXT: sw $4, 0($1)
;
; N32-LABEL: insert_v2i64_vidx:
; N32: # %bb.0:
@@ -2109,14 +2032,9 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind {
; N32-NEXT: lw $2, %got_disp(i32)($1)
; N32-NEXT: lw $2, 0($2)
; N32-NEXT: lw $1, %got_disp(v2i64)($1)
-; N32-NEXT: ld.d $w0, 0($1)
-; N32-NEXT: sll $2, $2, 3
-; N32-NEXT: sld.b $w0, $w0[$2]
-; N32-NEXT: insert.d $w0[0], $4
-; N32-NEXT: neg $2, $2
-; N32-NEXT: sld.b $w0, $w0[$2]
+; N32-NEXT: lsa $1, $2, $1, 3
; N32-NEXT: jr $ra
-; N32-NEXT: st.d $w0, 0($1)
+; N32-NEXT: sd $4, 0($1)
;
; N64-LABEL: insert_v2i64_vidx:
; N64: # %bb.0:
@@ -2126,14 +2044,9 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind {
; N64-NEXT: ld $2, %got_disp(i32)($1)
; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: ld $1, %got_disp(v2i64)($1)
-; N64-NEXT: ld.d $w0, 0($1)
-; N64-NEXT: dsll $2, $2, 3
-; N64-NEXT: sld.b $w0, $w0[$2]
-; N64-NEXT: insert.d $w0[0], $4
-; N64-NEXT: dneg $2, $2
-; N64-NEXT: sld.b $w0, $w0[$2]
+; N64-NEXT: dlsa $1, $2, $1, 3
; N64-NEXT: jr $ra
-; N64-NEXT: st.d $w0, 0($1)
+; N64-NEXT: sd $4, 0($1)
%1 = load <2 x i64>, ptr @v2i64
%2 = load i32, ptr @i32
%3 = insertelement <2 x i64> %1, i64 %a, i32 %2
diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll b/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll
index 740e6169f81f9..83ac551c8dc94 100644
--- a/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll
+++ b/llvm/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -275,7 +275,7 @@ define void @insert_v4f32(float %a) nounwind {
; float argument passed in $f12
; ALL-DAG: insve.w [[R1]][1], $w12[0]
- store <4 x float> %2, ptr @v4f32
+ store volatile <4 x float> %2, ptr @v4f32
; ALL-DAG: st.w [[R1]]
ret void
@@ -291,7 +291,7 @@ define void @insert_v2f64(double %a) nounwind {
; double argument passed in $f12
; ALL-DAG: insve.d [[R1]][1], $w12[0]
- store <2 x double> %2, ptr @v2f64
+ store volatile <2 x double> %2, ptr @v2f64
; ALL-DAG: st.d [[R1]]
ret void
@@ -319,7 +319,7 @@ define void @insert_v4f32_vidx(float %a) nounwind {
; ALL-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
; ALL-DAG: sld.b [[R1]], [[R1]][[[NIDX]]]
- store <4 x float> %3, ptr @v4f32
+ store volatile <4 x float> %3, ptr @v4f32
; ALL-DAG: st.w [[R1]]
ret void
@@ -347,7 +347,7 @@ define void @insert_v2f64_vidx(double %a) nounwind {
; ALL-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
; ALL-DAG: sld.b [[R1]], [[R1]][[[NIDX]]]
- store <2 x double> %3, ptr @v2f64
+ store volatile <2 x double> %3, ptr @v2f64
; ALL-DAG: st.d [[R1]]
ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index fba7b706e63cc..078cf07ee740c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -28,23 +28,13 @@ define <4 x i64> @insertelt_v4i64(<4 x i64> %a, i64 %y) {
define void @insertelt_v4i64_store(ptr %x, i64 %y) {
; RV32-LABEL: insertelt_v4i64_store:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v8, a1
-; RV32-NEXT: vslide1down.vx v10, v10, a2
-; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT: vslideup.vi v8, v10, 3
-; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: sw a2, 28(a0)
+; RV32-NEXT: sw a1, 24(a0)
; RV32-NEXT: ret
;
; RV64-LABEL: insertelt_v4i64_store:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.s.x v10, a1
-; RV64-NEXT: vslideup.vi v8, v10, 3
-; RV64-NEXT: vse64.v v8, (a0)
+; RV64-NEXT: sd a1, 24(a0)
; RV64-NEXT: ret
%a = load <4 x i64>, ptr %x
%b = insertelement <4 x i64> %a, i64 %y, i32 3
@@ -96,24 +86,13 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) {
define void @insertelt_v3i64_store(ptr %x, i64 %y) {
; RV32-LABEL: insertelt_v3i64_store:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 3, e64, m2, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v10, v8, a1
-; RV32-NEXT: vslide1down.vx v10, v10, a2
-; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, ma
-; RV32-NEXT: vslideup.vi v8, v10, 2
-; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: sw a2, 20(a0)
+; RV32-NEXT: sw a1, 16(a0)
; RV32-NEXT: ret
;
; RV64-LABEL: insertelt_v3i64_store:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 3, e64, m2, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: vmv.s.x v10, a1
-; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, ma
-; RV64-NEXT: vslideup.vi v8, v10, 2
-; RV64-NEXT: vse64.v v8, (a0)
+; RV64-NEXT: sd a1, 16(a0)
; RV64-NEXT: ret
%a = load <3 x i64>, ptr %x, align 8
%b = insertelement <3 x i64> %a, i64 %y, i32 2
@@ -135,13 +114,7 @@ define <16 x i8> @insertelt_v16i8(<16 x i8> %a, i8 %y) {
define void @insertelt_v16i8_store(ptr %x, i8 %y) {
; CHECK-LABEL: insertelt_v16i8_store:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, a1
-; CHECK-NEXT: vsetivli zero, 15, e8, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 14
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: sb a1, 14(a0)
; CHECK-NEXT: ret
%a = load <16 x i8>, ptr %x
%b = insertelement <16 x i8> %a, i8 %y, i32 14
@@ -178,30 +151,17 @@ define <32 x i16> @insertelt_v32i16(<32 x i16> %a, i16 %y, i32 %idx) {
define void @insertelt_v32i16_store(ptr %x, i16 %y, i32 %idx) {
; RV32-LABEL: insertelt_v32i16_store:
; RV32: # %bb.0:
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; RV32-NEXT: vle16.v v8, (a0)
-; RV32-NEXT: vmv.s.x v12, a1
-; RV32-NEXT: addi a1, a2, 1
-; RV32-NEXT: vsetvli zero, a1, e16, m4, tu, ma
-; RV32-NEXT: vslideup.vx v8, v12, a2
-; RV32-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: sh a1, 0(a0)
; RV32-NEXT: ret
;
; RV64-LABEL: insertelt_v32i16_store:
; RV64: # %bb.0:
-; RV64-NEXT: li a3, 32
-; RV64-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; RV64-NEXT: vle16.v v8, (a0)
-; RV64-NEXT: vmv.s.x v12, a1
; RV64-NEXT: slli a2, a2, 32
-; RV64-NEXT: srli a2, a2, 32
-; RV64-NEXT: addi a1, a2, 1
-; RV64-NEXT: vsetvli zero, a1, e16, m4, tu, ma
-; RV64-NEXT: vslideup.vx v8, v12, a2
-; RV64-NEXT: vsetvli zero, a3, e16, m4, ta, ma
-; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: srli a2, a2, 31
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: sh a1, 0(a0)
; RV64-NEXT: ret
%a = load <32 x i16>, ptr %x
%b = insertelement <32 x i16> %a, i16 %y, i32 %idx
@@ -236,28 +196,17 @@ define <8 x float> @insertelt_v8f32(<8 x float> %a, float %y, i32 %idx) {
define void @insertelt_v8f32_store(ptr %x, float %y, i32 %idx) {
; RV32-LABEL: insertelt_v8f32_store:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
-; RV32-NEXT: vfmv.s.f v10, fa0
-; RV32-NEXT: addi a2, a1, 1
-; RV32-NEXT: vsetvli zero, a2, e32, m2, tu, ma
-; RV32-NEXT: vslideup.vx v8, v10, a1
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: fsw fa0, 0(a0)
; RV32-NEXT: ret
;
; RV64-LABEL: insertelt_v8f32_store:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: vfmv.s.f v10, fa0
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: addi a2, a1, 1
-; RV64-NEXT: vsetvli zero, a2, e32, m2, tu, ma
-; RV64-NEXT: vslideup.vx v8, v10, a1
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: srli a1, a1, 30
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: fsw fa0, 0(a0)
; RV64-NEXT: ret
%a = load <8 x float>, ptr %x
%b = insertelement <8 x float> %a, float %y, i32 %idx
@@ -277,15 +226,18 @@ define <8 x i64> @insertelt_v8i64_0(<8 x i64> %a, ptr %x) {
}
define void @insertelt_v8i64_0_store(ptr %x) {
-; CHECK-LABEL: insertelt_v8i64_0_store:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: li a1, -1
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a1
-; CHECK-NEXT: vse64.v v8, (a0)
-; CHECK-NEXT: ret
+; RV32-LABEL: insertelt_v8i64_0_store:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, -1
+; RV32-NEXT: sw a1, 4(a0)
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: insertelt_v8i64_0_store:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: sd a1, 0(a0)
+; RV64-NEXT: ret
%a = load <8 x i64>, ptr %x
%b = insertelement <8 x i64> %a, i64 -1, i32 0
store <8 x i64> %b, ptr %x
@@ -321,30 +273,20 @@ define <8 x i64> @insertelt_v8i64(<8 x i64> %a, i32 %idx) {
define void @insertelt_v8i64_store(ptr %x, i32 %idx) {
; RV32-LABEL: insertelt_v8i64_store:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a2, -1
-; RV32-NEXT: vmv.s.x v12, a2
-; RV32-NEXT: addi a2, a1, 1
-; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, ma
-; RV32-NEXT: vslideup.vx v8, v12, a1
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: li a1, -1
+; RV32-NEXT: sw a1, 4(a0)
+; RV32-NEXT: sw a1, 0(a0)
; RV32-NEXT: ret
;
; RV64-LABEL: insertelt_v8i64_store:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: li a2, -1
-; RV64-NEXT: vmv.s.x v12, a2
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: addi a2, a1, 1
-; RV64-NEXT: vsetvli zero, a2, e64, m4, tu, ma
-; RV64-NEXT: vslideup.vx v8, v12, a1
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
+; RV64-NEXT: srli a1, a1, 29
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: sd a1, 0(a0)
; RV64-NEXT: ret
%a = load <8 x i64>, ptr %x
%b = insertelement <8 x i64> %a, i64 -1, i32 %idx
@@ -364,15 +306,18 @@ define <8 x i64> @insertelt_c6_v8i64_0(<8 x i64> %a, ptr %x) {
}
define void @insertelt_c6_v8i64_0_store(ptr %x) {
-; CHECK-LABEL: insertelt_c6_v8i64_0_store:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma
-; CHECK-NEXT: vmv.s.x v8, a1
-; CHECK-NEXT: vse64.v v8, (a0)
-; CHECK-NEXT: ret
+; RV32-LABEL: insertelt_c6_v8i64_0_store:
+; RV32: # %bb.0:
+; RV32-NEXT: sw zero, 4(a0)
+; RV32-NEXT: li a1, 6
+; RV32-NEXT: sw a1, 0(a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: insertelt_c6_v8i64_0_store:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 6
+; RV64-NEXT: sd a1, 0(a0)
+; RV64-NEXT: ret
%a = load <8 x i64>, ptr %x
%b = insertelement <8 x i64> %a, i64 6, i32 0
store <8 x i64> %b, ptr %x
@@ -408,30 +353,20 @@ define <8 x i64> @insertelt_c6_v8i64(<8 x i64> %a, i32 %idx) {
define void @insertelt_c6_v8i64_store(ptr %x, i32 %idx) {
; RV32-LABEL: insertelt_c6_v8i64_store:
; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vle64.v v8, (a0)
-; RV32-NEXT: li a2, 6
-; RV32-NEXT: vmv.s.x v12, a2
-; RV32-NEXT: addi a2, a1, 1
-; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, ma
-; RV32-NEXT: vslideup.vx v8, v12, a1
-; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT: vse64.v v8, (a0)
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sw zero, 4(a0)
+; RV32-NEXT: li a1, 6
+; RV32-NEXT: sw a1, 0(a0)
; RV32-NEXT: ret
;
; RV64-LABEL: insertelt_c6_v8i64_store:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vle64.v v8, (a0)
-; RV64-NEXT: li a2, 6
-; RV64-NEXT: vmv.s.x v12, a2
; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: srli a1, a1, 32
-; RV64-NEXT: addi a2, a1, 1
-; RV64-NEXT: vsetvli zero, a2, e64, m4, tu, ma
-; RV64-NEXT: vslideup.vx v8, v12, a1
-; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT: vse64.v v8, (a0)
+; RV64-NEXT: srli a1, a1, 29
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: li a1, 6
+; RV64-NEXT: sd a1, 0(a0)
; RV64-NEXT: ret
%a = load <8 x i64>, ptr %x
%b = insertelement <8 x i64> %a, i64 6, i32 %idx
diff --git a/llvm/test/CodeGen/X86/pr47874.ll b/llvm/test/CodeGen/X86/pr47874.ll
index 96f76695b1729..2da3585357a1c 100644
--- a/llvm/test/CodeGen/X86/pr47874.ll
+++ b/llvm/test/CodeGen/X86/pr47874.ll
@@ -140,17 +140,16 @@ define void @c(ptr %arg, ptr %arg1, i32 %arg2) {
; SSE2-NEXT: testl %edx, %edx
; SSE2-NEXT: jle LBB2_3
; SSE2-NEXT: ## %bb.1: ## %bb4
-; SSE2-NEXT: movd %edx, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: LBB2_2: ## %bb8
; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1
; SSE2-NEXT: ## InlineAsm Start
; SSE2-NEXT: ## InlineAsm End
-; SSE2-NEXT: movaps (%rdi), %xmm0
-; SSE2-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
-; SSE2-NEXT: movaps %xmm0, (%rdi)
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload
+; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: decq %rax
; SSE2-NEXT: jne LBB2_2
@@ -162,17 +161,17 @@ define void @c(ptr %arg, ptr %arg1, i32 %arg2) {
; AVX-NEXT: testl %edx, %edx
; AVX-NEXT: jle LBB2_3
; AVX-NEXT: ## %bb.1: ## %bb4
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
; AVX-NEXT: movl %edx, %eax
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: LBB2_2: ## %bb8
; AVX-NEXT: ## =>This Inner Loop Header: Depth=1
; AVX-NEXT: ## InlineAsm Start
; AVX-NEXT: ## InlineAsm End
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; AVX-NEXT: vmovaps %xmm0, (%rdi)
+; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; AVX-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: addq $16, %rdi
; AVX-NEXT: decq %rax
; AVX-NEXT: jne LBB2_2
@@ -210,16 +209,15 @@ define void @d(ptr %arg, ptr %arg1, i64 %arg2) {
; SSE2-NEXT: testq %rdx, %rdx
; SSE2-NEXT: jle LBB3_3
; SSE2-NEXT: ## %bb.1: ## %bb3
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: LBB3_2: ## %bb6
; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1
; SSE2-NEXT: ## InlineAsm Start
; SSE2-NEXT: ## InlineAsm End
-; SSE2-NEXT: movapd (%rdi), %xmm0
-; SSE2-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
-; SSE2-NEXT: movapd %xmm0, (%rdi)
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: addsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Folded Reload
+; SSE2-NEXT: movsd %xmm0, (%rdi)
; SSE2-NEXT: addq $16, %rdi
; SSE2-NEXT: decq %rdx
; SSE2-NEXT: jne LBB3_2
@@ -231,16 +229,16 @@ define void @d(ptr %arg, ptr %arg1, i64 %arg2) {
; AVX-NEXT: testq %rdx, %rdx
; AVX-NEXT: jle LBB3_3
; AVX-NEXT: ## %bb.1: ## %bb3
-; AVX-NEXT: vmovq %rdx, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; AVX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: LBB3_2: ## %bb6
; AVX-NEXT: ## =>This Inner Loop Header: Depth=1
; AVX-NEXT: ## InlineAsm Start
; AVX-NEXT: ## InlineAsm End
-; AVX-NEXT: vmovapd (%rdi), %xmm0
-; AVX-NEXT: vaddsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; AVX-NEXT: vmovapd %xmm0, (%rdi)
+; AVX-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; AVX-NEXT: ## xmm0 = mem[0],zero
+; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: addq $16, %rdi
; AVX-NEXT: decq %rdx
; AVX-NEXT: jne LBB3_2
diff --git a/llvm/test/CodeGen/X86/pr59980.ll b/llvm/test/CodeGen/X86/pr59980.ll
index 1179c976e7cf1..0823f960724e2 100644
--- a/llvm/test/CodeGen/X86/pr59980.ll
+++ b/llvm/test/CodeGen/X86/pr59980.ll
@@ -7,21 +7,9 @@
define void @foo(ptr %0, ptr %1, ptr %2) #0 {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0:
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: andq $-32, %rsp
-; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: movl (%rdx), %eax
-; CHECK-NEXT: andl $15, %eax
; CHECK-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
-; CHECK-NEXT: vmovups (%rsi), %ymm1
-; CHECK-NEXT: vmovaps %ymm1, (%rsp)
-; CHECK-NEXT: vpextrw $0, %xmm0, (%rsp,%rax,2)
-; CHECK-NEXT: vmovaps (%rsp), %ymm0
-; CHECK-NEXT: vmovups %ymm0, (%rsi)
-; CHECK-NEXT: movq %rbp, %rsp
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vpextrw $0, %xmm0, (%rsi,%rax,2)
; CHECK-NEXT: retq
%4 = bitcast ptr %2 to ptr
%5 = load i64, ptr %4, align 8
diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
index 72b71faadae63..f561a2a20e194 100644
--- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
@@ -59,19 +59,17 @@ define void @t3() {
; X86-LABEL: t3:
; X86: ## %bb.0:
; X86-NEXT: movl L_g0$non_lazy_ptr, %eax
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: movl L_g1$non_lazy_ptr, %ecx
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: pinsrw $0, (%eax), %xmm0
-; X86-NEXT: movq %xmm0, (%ecx)
+; X86-NEXT: movw %ax, (%ecx)
; X86-NEXT: retl
;
; X64-LABEL: t3:
; X64: ## %bb.0:
; X64-NEXT: movq _g0@GOTPCREL(%rip), %rax
+; X64-NEXT: movzwl (%rax), %eax
; X64-NEXT: movq _g1@GOTPCREL(%rip), %rcx
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: pinsrw $0, (%rax), %xmm0
-; X64-NEXT: movq %xmm0, (%rcx)
+; X64-NEXT: movw %ax, (%rcx)
; X64-NEXT: retq
load i16, ptr @g0
load <4 x i16>, ptr @g1