[llvm] [IRBuilder] Limit created masked load/store alignment to valid 32bit values (PR #73647)

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 28 05:59:11 PST 2023


https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/73647

In rare situations, mostly when the base pointer is null, the computed Alignment can be so high that it overflows a 32-bit value, leaving us with a masked load/store whose alignment is zero, which the verifier rejects.

This patch limits the alignment passed to the created masked.load and masked.store (and the equivalent masked.gather and masked.scatter) intrinsics to the maximum valid alignment that can fit in a 32-bit integer.
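
For reference, here is a minimal standalone sketch of the clamping the patch performs (the helper name clampAlignTo32Bit and the example values are illustrative, not part of the LLVM sources; the actual change inlines the std::min call in IRBuilder.cpp):

#include <algorithm>
#include <cstdint>

// Clamp an alignment, given as a power-of-two byte count, to the largest
// value that still fits in the i32 immediate the masked intrinsics take.
// 0x80000000 (2^31 bytes) is the maximum valid alignment; anything larger
// would truncate to zero when emitted via getInt32().
static uint32_t clampAlignTo32Bit(uint64_t AlignmentInBytes) {
  return static_cast<uint32_t>(
      std::min<uint64_t>(AlignmentInBytes, 0x80000000ULL));
}

int main() {
  // A null base pointer can be inferred to have an enormous alignment
  // (e.g. 2^63 bytes), which overflows the i32 operand unless clamped.
  uint64_t HugeAlign = 1ULL << 63;
  return clampAlignTo32Bit(HugeAlign) == 0x80000000u ? 0 : 1;
}

Note that 0x80000000 prints as -2147483648 when rendered as a signed i32, which is why the updated tests below check for that immediate.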

>From 05bb66c45e3c1501b12b58a7e0a111c66cfc0448 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 28 Nov 2023 13:51:11 +0000
Subject: [PATCH] [IRBuilder] Limit created masked load/store alignment to
 valid 32bit values.

In rare situations, mostly when the base pointer is null, the computed
Alignment can be so high that it overflows a 32-bit value, leaving us with a
masked load/store whose alignment is zero, which the verifier rejects.

This patch limits the alignment passed to the created masked.load and
masked.store to the maximum valid alignment that can fit in a 32-bit integer.
---
 llvm/lib/IR/IRBuilder.cpp                     | 12 ++++--
 .../AArch64/sve-intrinsic-loadstore.ll        | 42 +++++++++++++++++++
 2 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index b09b80f95871a1c..dccf46bae1cf7f6 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -584,7 +584,8 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment,
   if (!PassThru)
     PassThru = PoisonValue::get(Ty);
   Type *OverloadedTypes[] = { Ty, PtrTy };
-  Value *Ops[] = {Ptr, getInt32(Alignment.value()), Mask, PassThru};
+  unsigned Align32 = std::min<uint64_t>(Alignment.value(), 0x80000000ULL);
+  Value *Ops[] = {Ptr, getInt32(Align32), Mask, PassThru};
   return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops,
                                OverloadedTypes, Name);
 }
@@ -602,7 +603,8 @@ CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
   assert(DataTy->isVectorTy() && "Val should be a vector");
   assert(Mask && "Mask should not be all-ones (null)");
   Type *OverloadedTypes[] = { DataTy, PtrTy };
-  Value *Ops[] = {Val, Ptr, getInt32(Alignment.value()), Mask};
+  unsigned Align32 = std::min<uint64_t>(Alignment.value(), 0x80000000ULL);
+  Value *Ops[] = {Val, Ptr, getInt32(Align32), Mask};
   return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
 }
 
@@ -643,7 +645,8 @@ CallInst *IRBuilderBase::CreateMaskedGather(Type *Ty, Value *Ptrs,
     PassThru = PoisonValue::get(Ty);
 
   Type *OverloadedTypes[] = {Ty, PtrsTy};
-  Value *Ops[] = {Ptrs, getInt32(Alignment.value()), Mask, PassThru};
+  unsigned Align32 = std::min<uint64_t>(Alignment.value(), 0x80000000ULL);
+  Value *Ops[] = {Ptrs, getInt32(Align32), Mask, PassThru};
 
   // We specify only one type when we create this intrinsic. Types of other
   // arguments are derived from this type.
@@ -668,7 +671,8 @@ CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
     Mask = getAllOnesMask(NumElts);
 
   Type *OverloadedTypes[] = {DataTy, PtrsTy};
-  Value *Ops[] = {Data, Ptrs, getInt32(Alignment.value()), Mask};
+  unsigned Align32 = std::min<uint64_t>(Alignment.value(), 0x80000000ULL);
+  Value *Ops[] = {Data, Ptrs, getInt32(Align32), Mask};
 
   // We specify only one type when we create this intrinsic. Types of other
   // arguments are derived from this type.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll
index c67662f872503e4..979653f8a2f7520 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-loadstore.ll
@@ -99,6 +99,48 @@ define void @combine_st1_masked_casted_predicate(<vscale x 8 x i16> %vec, ptr %p
   ret void
 }
 
+define <vscale x 4 x i32> @combine_ld1_superalign(ptr %ptr) #0 {
+; CHECK-LABEL: @combine_ld1_superalign(
+; CHECK-NEXT:    store i1 true, ptr poison, align 1
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
+;
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr null)
+  ret <vscale x 4 x i32> %2
+}
+
+define <vscale x 4 x i32> @combine_ld1_masked_superalign(ptr %ptr) #0 {
+; CHECK-LABEL: @combine_ld1_masked_superalign(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr null, i32 -2147483648, <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
+  %2 = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %1, ptr null)
+  ret <vscale x 4 x i32> %2
+}
+
+define void @combine_st1_superalign(<vscale x 4 x i32> %vec, ptr %ptr) #0 {
+; CHECK-LABEL: @combine_st1_superalign(
+; CHECK-NEXT:    store <vscale x 4 x i32> poison, ptr null, align 16
+; CHECK-NEXT:    ret void
+;
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  call void @llvm.aarch64.sve.st1.nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %1, ptr null)
+  ret void
+}
+
+define void @combine_st1_masked_superalign(<vscale x 4 x i32> %vec, ptr %ptr) #0 {
+; CHECK-LABEL: @combine_st1_masked_superalign(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[VEC:%.*]], ptr null, i32 -2147483648, <vscale x 4 x i1> [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 16)
+  call void @llvm.aarch64.sve.st1.nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %1, ptr null)
+  ret void
+}
+
 declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
 declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
 



More information about the llvm-commits mailing list