[llvm-branch-commits] [llvm] 38ebc1a - [VectorCombine] optimize alignment for load transform
Sanjay Patel via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Dec 16 12:31:38 PST 2020
Author: Sanjay Patel
Date: 2020-12-16T15:25:45-05:00
New Revision: 38ebc1a13dc8ce41917d66918b319d793dc2fb02
URL: https://github.com/llvm/llvm-project/commit/38ebc1a13dc8ce41917d66918b319d793dc2fb02
DIFF: https://github.com/llvm/llvm-project/commit/38ebc1a13dc8ce41917d66918b319d793dc2fb02.diff
LOG: [VectorCombine] optimize alignment for load transform
Here's another minimal step suggested by D93229 / D93397.
(I'm trying to be extra careful in these changes because
load transforms are easy to get wrong.)
We can optimistically choose the greater alignment of a
load and its pointer operand. As the test diffs show, this
can turn what would have been unaligned vector loads
into aligned loads (see the sketch below).
When we later extend this to handle GEP offsets, we will
need to adjust the alignment calculation to account for
that offset.
Differential Revision: https://reviews.llvm.org/D93406
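As a minimal sketch of the effect (the function name @example is
illustrative, not one of the committed tests, though the pattern mirrors
load_f32_insert_v4f32 in load.ll): a scalar load through a pointer known
to be 16-byte aligned used to be widened into a vector load carrying only
the scalar load's alignment; it now carries the greater alignment of the
pointer operand.

  ; Input: the pointer is known align 16; the scalar load promises only align 4.
  define <4 x float> @example(float* align 16 dereferenceable(16) %p) {
    %s = load float, float* %p, align 4
    %r = insertelement <4 x float> undef, float %s, i32 0
    ret <4 x float> %r
  }

  ; After VectorCombine, the widened load takes the pointer's alignment:
  ;   %tmp = bitcast float* %p to <4 x float>*
  ;   %v = load <4 x float>, <4 x float>* %tmp, align 16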
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/load.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 086169c55c8d..8e341619dcf4 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -143,7 +143,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
return false;
// Original pattern: insertelt undef, load [free casts of] PtrOp, 0
- Align Alignment = Load->getAlign();
+ // Use the greater of the alignment on the load or its source pointer.
+ Align Alignment = std::max(SrcPtr->getPointerAlignment(DL), Load->getAlign());
Type *LoadTy = Load->getType();
int OldCost = TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index f5a962dd7cfe..e8ba175b0235 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -175,7 +175,7 @@ define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable
define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
@@ -201,7 +201,7 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenc
define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x i32> [[R]]
;
@@ -434,7 +434,7 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i32> [[R]]
;
@@ -458,7 +458,7 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceabl
define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <16 x float> [[R]]
;
@@ -470,7 +470,7 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT: ret <2 x float> [[R]]
;
@@ -525,7 +525,7 @@ define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr
define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;
@@ -538,7 +538,7 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 derefe
define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]
;