[llvm] 7a94d4f - Allow combining of extract_subvector to extract element
Stanislav Mekhanoshin via llvm-commits
llvm-commits@lists.llvm.org
Fri Jan 24 11:03:09 PST 2020
Author: Stanislav Mekhanoshin
Date: 2020-01-24T10:50:26-08:00
New Revision: 7a94d4f4ee435386ff47f7f3ecad4e56608578b6
URL: https://github.com/llvm/llvm-project/commit/7a94d4f4ee435386ff47f7f3ecad4e56608578b6
DIFF: https://github.com/llvm/llvm-project/commit/7a94d4f4ee435386ff47f7f3ecad4e56608578b6.diff
LOG: Allow combining of extract_subvector to extract element
Differential Revision: https://reviews.llvm.org/D73132
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
llvm/test/CodeGen/ARM/vdup.ll
llvm/test/CodeGen/ARM/vext.ll
llvm/test/CodeGen/ARM/vpadd.ll
llvm/test/CodeGen/ARM/vuzp.ll
llvm/test/CodeGen/X86/sse41.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4274bdffac0e..1d3c5fd6ed22 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18581,6 +18581,13 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
}
+ if (NewExtNumElts == 1 &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
+ SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
+ V.getOperand(0), NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 3e8384ad30d5..00410b76defb 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -5,9 +5,8 @@ define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_23uu:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -19,10 +18,10 @@ define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_234u:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -154,7 +153,7 @@ define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_0101:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -181,9 +180,8 @@ define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_0145:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -196,11 +194,9 @@ define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_0167:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -226,9 +222,9 @@ define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_2323:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -240,10 +236,8 @@ define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_2345:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -256,10 +250,9 @@ define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_2367:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -271,10 +264,11 @@ define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_4501:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: global_load_dword v1, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -286,10 +280,11 @@ define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_4523:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v[2:3], off
+; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -301,7 +296,7 @@ define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_4545:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dword v0, v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -328,11 +323,11 @@ define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_6701:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v1, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -344,10 +339,11 @@ define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_6723:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
+; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -373,9 +369,9 @@ define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_6767:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
+; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -388,13 +384,12 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -407,11 +402,12 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -485,13 +481,12 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
-; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
@@ -503,11 +498,9 @@ define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
; GFX9-LABEL: shuffle_v4i16_0167:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
@@ -590,12 +583,11 @@ define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_2333:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -607,12 +599,11 @@ define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-LABEL: shuffle_v4f16_6667:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
+; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll
index 74ee4913b5ed..8ddbd8539bc7 100644
--- a/llvm/test/CodeGen/ARM/vdup.ll
+++ b/llvm/test/CodeGen/ARM/vdup.ll
@@ -429,8 +429,8 @@ define <4 x i32> @tduplane(<4 x i32> %invec) {
define <2 x float> @check_f32(<4 x float> %v) nounwind {
; CHECK-LABEL: check_f32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vdup.32 d16, d17[1]
+; CHECK-NEXT: vmov d16, r2, r3
+; CHECK-NEXT: vdup.32 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%x = extractelement <4 x float> %v, i32 3
@@ -442,8 +442,8 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind {
define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
; CHECK-LABEL: check_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vdup.32 d16, d17[1]
+; CHECK-NEXT: vmov d16, r2, r3
+; CHECK-NEXT: vdup.32 d16, d16[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%x = extractelement <4 x i32> %v, i32 3
diff --git a/llvm/test/CodeGen/ARM/vext.ll b/llvm/test/CodeGen/ARM/vext.ll
index c00bc41c25d5..554588fcc8e9 100644
--- a/llvm/test/CodeGen/ARM/vext.ll
+++ b/llvm/test/CodeGen/ARM/vext.ll
@@ -183,10 +183,10 @@ define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
; CHECK-NEXT: vext.16 d16, d16, d17, #3
-; CHECK-NEXT: vorr d17, d16, d16
-; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vuzp.16 d16, d17
-; CHECK-NEXT: vzip.16 d16, d18
+; CHECK-NEXT: vorr d18, d16, d16
+; CHECK-NEXT: vldr d17, [r1]
+; CHECK-NEXT: vuzp.16 d16, d18
+; CHECK-NEXT: vzip.16 d16, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -216,17 +216,15 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
; CHECK-LABEL: test_multisource:
; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d18, [r0, #32]
; CHECK-NEXT: mov r1, r0
-; CHECK-NEXT: add r2, r0, #48
-; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vorr d22, d18, d18
; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
-; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128]
-; CHECK-NEXT: vorr d24, d20, d20
-; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
-; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]
-; CHECK-NEXT: vzip.16 d24, d18
-; CHECK-NEXT: vtrn.16 q8, q11
-; CHECK-NEXT: vext.16 d18, d20, d24, #2
+; CHECK-NEXT: vldr d19, [r0, #48]
+; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
+; CHECK-NEXT: vzip.16 d22, d19
+; CHECK-NEXT: vtrn.16 q8, q10
+; CHECK-NEXT: vext.16 d18, d18, d22, #2
; CHECK-NEXT: vext.16 d16, d18, d16, #2
; CHECK-NEXT: vext.16 d16, d16, d16, #2
; CHECK-NEXT: vmov r0, r1, d16
diff --git a/llvm/test/CodeGen/ARM/vpadd.ll b/llvm/test/CodeGen/ARM/vpadd.ll
index 72c3da298cf3..f5c0a4109e15 100644
--- a/llvm/test/CodeGen/ARM/vpadd.ll
+++ b/llvm/test/CodeGen/ARM/vpadd.ll
@@ -285,11 +285,11 @@ define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ss
define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_s8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vext.8 d18, d16, d16, #1
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vext.8 d17, d16, d16, #1
; CHECK-NEXT: vshl.i16 d16, d16, #8
-; CHECK-NEXT: vshl.i16 d18, d18, #8
-; CHECK-NEXT: vshr.s16 d17, d18, #8
+; CHECK-NEXT: vshl.i16 d17, d17, #8
+; CHECK-NEXT: vshr.s16 d17, d17, #8
; CHECK-NEXT: vsra.s16 d17, d16, #8
; CHECK-NEXT: vstr d17, [r1]
; CHECK-NEXT: mov pc, lr
@@ -347,11 +347,11 @@ define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X)
define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT: vext.8 d18, d16, d16, #1
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vext.8 d17, d16, d16, #1
; CHECK-NEXT: vbic.i16 d16, #0xff00
-; CHECK-NEXT: vbic.i16 d18, #0xff00
-; CHECK-NEXT: vadd.i16 d16, d18, d16
+; CHECK-NEXT: vbic.i16 d17, #0xff00
+; CHECK-NEXT: vadd.i16 d16, d17, d16
; CHECK-NEXT: vstr d16, [r1]
; CHECK-NEXT: mov pc, lr
%tmp = load <16 x i8>, <16 x i8>* %cbcr
@@ -368,7 +368,7 @@ define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp
define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vmovl.u8 q8, d16
; CHECK-NEXT: vpadd.i16 d16, d16, d17
; CHECK-NEXT: vstr d16, [r1]
diff --git a/llvm/test/CodeGen/ARM/vuzp.ll b/llvm/test/CodeGen/ARM/vuzp.ll
index 6a48f0cf4498..a5d6a6276627 100644
--- a/llvm/test/CodeGen/ARM/vuzp.ll
+++ b/llvm/test/CodeGen/ARM/vuzp.ll
@@ -522,12 +522,11 @@ define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
define %struct.uint8x8x2_t @vuzp_extract_subvector(<16 x i8> %t) #0 {
; CHECK-LABEL: vuzp_extract_subvector:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
-; CHECK-NEXT: vorr d18, d17, d17
-; CHECK-NEXT: vuzp.8 d16, d18
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d18
+; CHECK-NEXT: vmov d16, r2, r3
+; CHECK-NEXT: vmov d17, r0, r1
+; CHECK-NEXT: vuzp.8 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
; CHECK-NEXT: mov pc, lr
%vuzp.i = shufflevector <16 x i8> %t, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 4e80f8f92d83..70c59133f85a 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -97,9 +97,8 @@ define <2 x i64> @pmovzxbq_1() nounwind {
; X86-AVX512: ## %bb.0: ## %entry
; X86-AVX512-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
; X86-AVX512-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
-; X86-AVX512-NEXT: vpbroadcastw (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
-; X86-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
-; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX512-NEXT: vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
+; X86-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: pmovzxbq_1:
@@ -122,9 +121,8 @@ define <2 x i64> @pmovzxbq_1() nounwind {
; X64-AVX512: ## %bb.0: ## %entry
; X64-AVX512-NEXT: movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; X64-AVX512-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
-; X64-AVX512-NEXT: vpbroadcastw (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
-; X64-AVX512-NEXT: vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
-; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX512-NEXT: vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
+; X64-AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
entry:
%0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
More information about the llvm-commits
mailing list