[llvm] 3e76d01 - [AArch64][CodeGen] Allow fixed-length vectors wider than the hardware supports to use SVE's zero/sign-extending loads.
Dinar Temirbulatov via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 19 07:18:37 PDT 2023
Author: Dinar Temirbulatov
Date: 2023-04-19T14:17:32Z
New Revision: 3e76d012d174a2e33564a6d8476c7f88c7d1b4b8
URL: https://github.com/llvm/llvm-project/commit/3e76d012d174a2e33564a6d8476c7f88c7d1b4b8
DIFF: https://github.com/llvm/llvm-project/commit/3e76d012d174a2e33564a6d8476c7f88c7d1b4b8.diff
LOG: [AArch64][CodeGen] Allow fixed-length vectors wider than the hardware supports to use SVE's zero/sign-extending loads.
Prefer to fold LOAD + SIGN_EXTEND/ZERO_EXTEND into SVE's extending load instructions
for fixed-length vectors, even if the type is not directly representable in hardware.
Differential Revision: https://reviews.llvm.org/D147533
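To illustrate the effect, consider the pattern exercised by the load_sext_v64i16i32 test below. The diff only shows the CHECK lines, so the IR body and the "+sve" attribute here are a reconstruction of the usual shape of these tests, not copied from the patch:

    ; Load <64 x i16> and sign-extend to <64 x i32>. At a 1024-bit vector
    ; length the result is wider than one SVE register, so it must be
    ; split across two registers.
    define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
      %a = load <64 x i16>, ptr %ap
      %val = sext <64 x i16> %a to <64 x i32>
      ret <64 x i32> %val
    }

    attributes #0 = { "target-features"="+sve" }

Previously the whole <64 x i16> source was loaded with a plain ld1h and then widened with sunpklo/ext sequences; after this change each half of the result is loaded directly with an extending ld1sh, as the updated VBITS_GE_1024 CHECK lines below show.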
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 514712c737fe..3939f4de416b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5228,9 +5228,7 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector() ||
- useSVEForFixedLengthVectorVT(
- ExtVal.getValueType(),
- /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
+ Subtarget->useSVEForFixedLengthVectors();
}
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index 2aec28e783a3..d7bd08628ff5 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -73,13 +73,10 @@ define <32 x i32> @load_zext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
define <64 x i32> @load_zext_v64i16i32(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_zext_v64i16i32:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: mov x9, #32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: ld1h { z1.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
@@ -145,13 +142,10 @@ define <32 x i32> @load_sext_v32i16i32(ptr %ap) vscale_range(8,0) #0 {
define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_sext_v64i16i32:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl64
-; VBITS_GE_1024-NEXT: mov x9, #32
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #32 // =0x20
; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: ld1sh { z1.s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x8, x9, lsl #2]
; VBITS_GE_1024-NEXT: st1w { z1.s }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
@@ -170,17 +164,11 @@ define <64 x i32> @load_sext_v64i16i32(ptr %ap) #0 {
define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_zext_v32i8i64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
-; VBITS_GE_1024-NEXT: mov x9, #16
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z1.h, z0.b
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [x0, x9]
+; VBITS_GE_1024-NEXT: ld1b { z1.d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
@@ -199,17 +187,11 @@ define <32 x i64> @load_zext_v32i8i64(ptr %ap) #0 {
define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_sext_v32i8i64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.b, vl32
-; VBITS_GE_1024-NEXT: mov x9, #16
-; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov w9, #16 // =0x10
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: sunpklo z1.h, z0.b
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #16
-; VBITS_GE_1024-NEXT: sunpklo z0.h, z0.b
-; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h
-; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1sb { z0.d }, p0/z, [x0, x9]
+; VBITS_GE_1024-NEXT: ld1sb { z1.d }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
@@ -228,15 +210,10 @@ define <32 x i64> @load_sext_v32i8i64(ptr %ap) #0 {
define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_zext_v32i16i64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: mov x9, #16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
-; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: ld1h { z1.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
@@ -255,15 +232,10 @@ define <32 x i64> @load_zext_v32i16i64(ptr %ap) #0 {
define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_sext_v32i16i64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.h, vl32
-; VBITS_GE_1024-NEXT: mov x9, #16
-; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: sunpklo z1.s, z0.h
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #32
-; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h
-; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s
-; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1sh { z0.d }, p0/z, [x0, x9, lsl #1]
+; VBITS_GE_1024-NEXT: ld1sh { z1.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
@@ -282,13 +254,10 @@ define <32 x i64> @load_sext_v32i16i64(ptr %ap) #0 {
define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_zext_v32i32i64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov x9, #16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: uunpklo z1.d, z0.s
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT: ld1w { z1.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
@@ -307,13 +276,10 @@ define <32 x i64> @load_zext_v32i32i64(ptr %ap) #0 {
define <32 x i64> @load_sext_v32i32i64(ptr %ap) #0 {
; VBITS_GE_1024-LABEL: load_sext_v32i32i64:
; VBITS_GE_1024: // %bb.0:
-; VBITS_GE_1024-NEXT: ptrue p0.s, vl32
-; VBITS_GE_1024-NEXT: mov x9, #16
-; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0]
+; VBITS_GE_1024-NEXT: mov x9, #16 // =0x10
; VBITS_GE_1024-NEXT: ptrue p0.d, vl16
-; VBITS_GE_1024-NEXT: sunpklo z1.d, z0.s
-; VBITS_GE_1024-NEXT: ext z0.b, z0.b, z0.b, #64
-; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0, x9, lsl #2]
+; VBITS_GE_1024-NEXT: ld1sw { z1.d }, p0/z, [x0]
; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x8, x9, lsl #3]
; VBITS_GE_1024-NEXT: st1d { z1.d }, p0, [x8]
; VBITS_GE_1024-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 94b4aad294f0..cefe83639e14 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -63,16 +63,14 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) #0 {
define <16 x i32> @load_sext_v16i8i32(ptr %ap) #0 {
; CHECK-LABEL: load_sext_v16i8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: sunpklo z3.h, z1.b
-; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT: sunpklo z4.h, z1.b
-; CHECK-NEXT: sunpklo z0.s, z3.h
-; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT: sunpklo z2.s, z4.h
-; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
-; CHECK-NEXT: sunpklo z1.s, z3.h
-; CHECK-NEXT: sunpklo z3.s, z4.h
+; CHECK-NEXT: mov w8, #4 // =0x4
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: mov w10, #12 // =0xc
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ld1sb { z1.s }, p0/z, [x0, x8]
+; CHECK-NEXT: ld1sb { z2.s }, p0/z, [x0, x9]
+; CHECK-NEXT: ld1sb { z3.s }, p0/z, [x0, x10]
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2
@@ -86,10 +84,10 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap) #0 {
define <8 x i32> @load_sext_v8i16i32(ptr %ap) #0 {
; CHECK-LABEL: load_sext_v8i16i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q1, [x0]
-; CHECK-NEXT: sunpklo z0.s, z1.h
-; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: mov x8, #4 // =0x4
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
; CHECK-NEXT: ret
@@ -165,25 +163,22 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) #0 {
define <16 x i64> @load_zext_v16i16i64(ptr %ap) #0 {
; CHECK-LABEL: load_zext_v16i16i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: uunpklo z3.s, z1.h
-; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8
-; CHECK-NEXT: uunpklo z7.s, z1.h
-; CHECK-NEXT: uunpklo z0.d, z3.s
-; CHECK-NEXT: uunpklo z5.s, z2.h
-; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
-; CHECK-NEXT: uunpklo z16.s, z2.h
-; CHECK-NEXT: uunpklo z4.d, z5.s
-; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
-; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
-; CHECK-NEXT: uunpklo z2.d, z7.s
-; CHECK-NEXT: uunpklo z6.d, z16.s
-; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8
-; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8
-; CHECK-NEXT: uunpklo z1.d, z3.s
-; CHECK-NEXT: uunpklo z5.d, z5.s
-; CHECK-NEXT: uunpklo z3.d, z7.s
-; CHECK-NEXT: uunpklo z7.d, z16.s
+; CHECK-NEXT: mov x8, #2 // =0x2
+; CHECK-NEXT: mov x9, #4 // =0x4
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: mov x10, #6 // =0x6
+; CHECK-NEXT: mov x11, #8 // =0x8
+; CHECK-NEXT: mov x12, #10 // =0xa
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: mov x8, #12 // =0xc
+; CHECK-NEXT: ld1h { z2.d }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: mov x9, #14 // =0xe
+; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0, x10, lsl #1]
+; CHECK-NEXT: ld1h { z4.d }, p0/z, [x0, x11, lsl #1]
+; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x12, lsl #1]
+; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x8, lsl #1]
+; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x9, lsl #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2