[llvm] [GVN] Permit load PRE to happen in more cases (PR #76063)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 20 07:22:24 PST 2023


https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/76063

There is code in PerformLoadPRE that bails out if the number of
predecessors where we have to insert a load is greater than one:

  // If we need to insert new load in multiple predecessors, reject it.
  // FIXME: If we could restructure the CFG, we could make a common pred with
  // all the preds that don't have an available Load and insert a new load into
  // that one block.
  if (NumInsertPreds > 1)
      return false;

However, this is quite restrictive because NumInsertPreds is the sum

  unsigned NumInsertPreds = PredLoads.size() + CriticalEdgePredSplit.size();

where PredLoads and CriticalEdgePredSplit are the lists of available and
unavailable loads, respectively. Therefore, with just one unavailable
load we refuse to perform load PRE if there are other predecessors with
available loads. I've changed this check above to be less conservative:

  if (CriticalEdgePredSplit.size() > 1)
      return false;

As a result we can perform load PRE in many more cases. On the
AArch64 neoverse-v1 machine we see 1% gains in both SPEC2017 perlbench
and omnetpp benchmarks. On neoverse-n1 we see 1% and 3.5% gains in
perlbench and omnetpp, respectively. I only observed one regression with
exchange2, but this is related to function specialisation and will be
fixed separately.

>From cd752126133508efdcfbb402e8420b894b4ff694 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 18 Dec 2023 15:56:18 +0000
Subject: [PATCH 1/2] Add more GVN load PRE tests

---
 llvm/test/Transforms/GVN/PRE/load-pre-licm.ll | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
index c52f46b4f63ee9..1e3e2189cd53f5 100644
--- a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
+++ b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
@@ -282,3 +282,86 @@ header:
   call void @hold(i32 %v1)
   br label %header
 }
+
+
+; In block Z one load is unavailable (via entry block), whereas via other 2
+; predecessors the loads are available.
+define i8 @test7a(i1 %c1, ptr %a, i8 %i, i8 %j) {
+; CHECK-LABEL: @test7a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[Z:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[A:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[PJ:%.*]] = getelementptr i8, ptr [[A]], i8 [[J:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
+; CHECK-NEXT:    [[Y:%.*]] = load i8, ptr [[PJ]], align 1
+; CHECK-NEXT:    [[C2:%.*]] = icmp slt i8 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[C2]], label [[Z]], label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[Z]]
+; CHECK:       Z:
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY:%.*]] ], [ [[I]], [[A]] ], [ [[J]], [[B]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[A]], i8 [[K]]
+; CHECK-NEXT:    [[Z:%.*]] = load i8, ptr [[PK]], align 1
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+entry:
+  br i1 %c1, label %A, label %Z
+
+A:
+  %pi = getelementptr i8, ptr %a, i8 %i
+  %pj = getelementptr i8, ptr %a, i8 %j
+  %x = load i8, ptr %pi
+  %y = load i8, ptr %pj
+  %c2 = icmp slt i8 %x, %y
+  br i1 %c2, label %Z, label %B
+
+B:
+  br label %Z
+
+Z:
+  %k = phi i8 [ %i, %entry ], [%i, %A], [ %j, %B ]
+  %pk = getelementptr i8, ptr %a, i8 %k
+  %z = load i8, ptr %pk
+  ret i8 %z
+}
+
+
+
+; In block Z two loads are unavailable (via entry and B blocks), whereas via
+; other predecessor (A) the load is available.
+define i8 @test7b(i1 %c1, ptr %a, i8 %i, i8 %j) {
+; CHECK-LABEL: @test7b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[Z:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[A:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
+; CHECK-NEXT:    [[C2:%.*]] = icmp slt i8 [[X]], 3
+; CHECK-NEXT:    br i1 [[C2]], label [[Z]], label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[Z]]
+; CHECK:       Z:
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY:%.*]] ], [ [[I]], [[A]] ], [ [[J:%.*]], [[B]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[A]], i8 [[K]]
+; CHECK-NEXT:    [[Z:%.*]] = load i8, ptr [[PK]], align 1
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+entry:
+  br i1 %c1, label %A, label %Z
+
+A:
+  %pi = getelementptr i8, ptr %a, i8 %i
+  %x = load i8, ptr %pi
+  %c2 = icmp slt i8 %x, 3
+  br i1 %c2, label %Z, label %B
+
+B:
+  br label %Z
+
+Z:
+  %k = phi i8 [ %i, %entry ], [%i, %A], [ %j, %B ]
+  %pk = getelementptr i8, ptr %a, i8 %k
+  %z = load i8, ptr %pk
+  ret i8 %z
+}

>From 50e96813df15ca4323e8e1017af499fad1662d27 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Mon, 18 Dec 2023 11:50:09 +0000
Subject: [PATCH 2/2] [GVN] Permit load PRE to happen in more cases

There is code in PerformLoadPRE that bails out if the number of
predecessors where we have to insert a load is greater than one:

  // If we need to insert new load in multiple predecessors, reject it.
  // FIXME: If we could restructure the CFG, we could make a common pred with
  // all the preds that don't have an available Load and insert a new load into
  // that one block.
  if (NumInsertPreds > 1)
      return false;

However, this is quite restrictive because NumInsertPreds is the sum

  unsigned NumInsertPreds = PredLoads.size() + CriticalEdgePredSplit.size();

where PredLoads and CriticalEdgePredSplit are the lists of available and
unavailable loads, respectively. Therefore, with just one unavailable
load we refuse to perform load PRE if there are other predecessors with
available loads. I've changed this check above to be less conservative:

  if (CriticalEdgePredSplit.size() > 1)
      return false;

As a result we can perform load PRE in many more cases. On the
AArch64 neoverse-v1 machine we see 1% gains in both SPEC2017 perlbench
and omnetpp benchmarks. On neoverse-n1 we see 1% and 3.5% gains in
perlbench and omnetpp, respectively. I only observed one regression with
exchange2, but this is related to function specialisation and will be
fixed separately.
---
 llvm/lib/Transforms/Scalar/GVN.cpp            |  2 +-
 llvm/test/Transforms/GVN/PRE/load-pre-licm.ll | 51 ++++++++++++-------
 llvm/test/Transforms/GVN/condprop.ll          | 20 +++++---
 3 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 5e58af0edc1556..02d7315115c4e5 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1625,7 +1625,7 @@ bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock,
   // FIXME: If we could restructure the CFG, we could make a common pred with
   // all the preds that don't have an available Load and insert a new load into
   // that one block.
-  if (NumInsertPreds > 1)
+  if (CriticalEdgePredSplit.size() > 1)
       return false;
 
   // Now we know where we will insert load. We must ensure that it is safe
diff --git a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
index 1e3e2189cd53f5..9ec9568b3a5bce 100644
--- a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
+++ b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
@@ -130,11 +130,13 @@ define i32 @test3(i1 %cnd, ptr %p) {
 ; CHECK:       header:
 ; CHECK-NEXT:    br i1 [[CND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
+; CHECK-NEXT:    [[V1_PRE1:%.*]] = load i32, ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    br label [[MERGE:%.*]]
 ; CHECK:       bb2:
+; CHECK-NEXT:    [[V1_PRE:%.*]] = load i32, ptr [[P]], align 4
 ; CHECK-NEXT:    br label [[MERGE]]
 ; CHECK:       merge:
-; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = phi i32 [ [[V1_PRE]], [[BB2]] ], [ [[V1_PRE1]], [[BB1]] ]
 ; CHECK-NEXT:    call void @hold(i32 [[V1]])
 ; CHECK-NEXT:    br label [[HEADER]]
 ;
@@ -286,13 +288,17 @@ header:
 
 ; In block Z one load is unavailable (via entry block), whereas via other 2
 ; predecessors the loads are available.
-define i8 @test7a(i1 %c1, ptr %a, i8 %i, i8 %j) {
+define i8 @test7a(i1 %c1, ptr %p, i8 %i, i8 %j) {
 ; CHECK-LABEL: @test7a(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[ENTRY_Z_CRIT_EDGE:%.*]]
+; CHECK:       entry.Z_crit_edge:
+; CHECK-NEXT:    [[PK_PHI_TRANS_INSERT:%.*]] = getelementptr i8, ptr [[P:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[Z_PRE1:%.*]] = load i8, ptr [[PK_PHI_TRANS_INSERT]], align 1
+; CHECK-NEXT:    br label [[Z:%.*]]
 ; CHECK:       A:
-; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[A:%.*]], i8 [[I:%.*]]
-; CHECK-NEXT:    [[PJ:%.*]] = getelementptr i8, ptr [[A]], i8 [[J:%.*]]
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[P]], i8 [[I]]
+; CHECK-NEXT:    [[PJ:%.*]] = getelementptr i8, ptr [[P]], i8 [[J:%.*]]
 ; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
 ; CHECK-NEXT:    [[Y:%.*]] = load i8, ptr [[PJ]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt i8 [[X]], [[Y]]
@@ -300,17 +306,18 @@ define i8 @test7a(i1 %c1, ptr %a, i8 %i, i8 %j) {
 ; CHECK:       B:
 ; CHECK-NEXT:    br label [[Z]]
 ; CHECK:       Z:
-; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY:%.*]] ], [ [[I]], [[A]] ], [ [[J]], [[B]] ]
-; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[A]], i8 [[K]]
-; CHECK-NEXT:    [[Z:%.*]] = load i8, ptr [[PK]], align 1
+; CHECK-NEXT:    [[Z:%.*]] = phi i8 [ [[Z_PRE1]], [[ENTRY_Z_CRIT_EDGE]] ], [ [[X]], [[A]] ], [ [[Y]], [[B]] ]
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY_Z_CRIT_EDGE]] ], [ [[I]], [[A]] ], [ [[J]], [[B]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[P]], i8 [[K]]
 ; CHECK-NEXT:    ret i8 [[Z]]
 ;
+
 entry:
   br i1 %c1, label %A, label %Z
 
 A:
-  %pi = getelementptr i8, ptr %a, i8 %i
-  %pj = getelementptr i8, ptr %a, i8 %j
+  %pi = getelementptr i8, ptr %p, i8 %i
+  %pj = getelementptr i8, ptr %p, i8 %j
   %x = load i8, ptr %pi
   %y = load i8, ptr %pj
   %c2 = icmp slt i8 %x, %y
@@ -321,7 +328,7 @@ B:
 
 Z:
   %k = phi i8 [ %i, %entry ], [%i, %A], [ %j, %B ]
-  %pk = getelementptr i8, ptr %a, i8 %k
+  %pk = getelementptr i8, ptr %p, i8 %k
   %z = load i8, ptr %pk
   ret i8 %z
 }
@@ -330,28 +337,34 @@ Z:
 
 ; In block Z two loads are unavailable (via entry and B blocks), whereas via
 ; other predecessor (A) the load is available.
-define i8 @test7b(i1 %c1, ptr %a, i8 %i, i8 %j) {
+define i8 @test7b(i1 %c1, ptr %p, i8 %i, i8 %j) {
 ; CHECK-LABEL: @test7b(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[Z:%.*]]
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[ENTRY_Z_CRIT_EDGE:%.*]]
+; CHECK:       entry.Z_crit_edge:
+; CHECK-NEXT:    [[PK_PHI_TRANS_INSERT1:%.*]] = getelementptr i8, ptr [[P:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[Z_PRE2:%.*]] = load i8, ptr [[PK_PHI_TRANS_INSERT1]], align 1
+; CHECK-NEXT:    br label [[Z:%.*]]
 ; CHECK:       A:
-; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[A:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[P]], i8 [[I]]
 ; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt i8 [[X]], 3
 ; CHECK-NEXT:    br i1 [[C2]], label [[Z]], label [[B:%.*]]
 ; CHECK:       B:
+; CHECK-NEXT:    [[PK_PHI_TRANS_INSERT:%.*]] = getelementptr i8, ptr [[P]], i8 [[J:%.*]]
+; CHECK-NEXT:    [[Z_PRE:%.*]] = load i8, ptr [[PK_PHI_TRANS_INSERT]], align 1
 ; CHECK-NEXT:    br label [[Z]]
 ; CHECK:       Z:
-; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY:%.*]] ], [ [[I]], [[A]] ], [ [[J:%.*]], [[B]] ]
-; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[A]], i8 [[K]]
-; CHECK-NEXT:    [[Z:%.*]] = load i8, ptr [[PK]], align 1
+; CHECK-NEXT:    [[Z:%.*]] = phi i8 [ [[Z_PRE2]], [[ENTRY_Z_CRIT_EDGE]] ], [ [[X]], [[A]] ], [ [[Z_PRE]], [[B]] ]
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY_Z_CRIT_EDGE]] ], [ [[I]], [[A]] ], [ [[J]], [[B]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[P]], i8 [[K]]
 ; CHECK-NEXT:    ret i8 [[Z]]
 ;
 entry:
   br i1 %c1, label %A, label %Z
 
 A:
-  %pi = getelementptr i8, ptr %a, i8 %i
+  %pi = getelementptr i8, ptr %p, i8 %i
   %x = load i8, ptr %pi
   %c2 = icmp slt i8 %x, 3
   br i1 %c2, label %Z, label %B
@@ -361,7 +374,7 @@ B:
 
 Z:
   %k = phi i8 [ %i, %entry ], [%i, %A], [ %j, %B ]
-  %pk = getelementptr i8, ptr %a, i8 %k
+  %pk = getelementptr i8, ptr %p, i8 %k
   %z = load i8, ptr %pk
   ret i8 %z
 }
diff --git a/llvm/test/Transforms/GVN/condprop.ll b/llvm/test/Transforms/GVN/condprop.ll
index 6b1e4d10601099..f93c50201942bf 100644
--- a/llvm/test/Transforms/GVN/condprop.ll
+++ b/llvm/test/Transforms/GVN/condprop.ll
@@ -214,11 +214,11 @@ define void @test4(i1 %b, i32 %x) {
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]]
 ; CHECK:       sw:
 ; CHECK-NEXT:    switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT:    i32 0, label [[CASE0:%.*]]
-; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
-; CHECK-NEXT:    i32 2, label [[CASE0]]
-; CHECK-NEXT:    i32 3, label [[CASE3]]
-; CHECK-NEXT:    i32 4, label [[DEFAULT]]
+; CHECK-NEXT:      i32 0, label [[CASE0:%.*]]
+; CHECK-NEXT:      i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:      i32 2, label [[CASE0]]
+; CHECK-NEXT:      i32 3, label [[CASE3]]
+; CHECK-NEXT:      i32 4, label [[DEFAULT]]
 ; CHECK-NEXT:    ]
 ; CHECK:       default:
 ; CHECK-NEXT:    call void @bar(i32 [[X]])
@@ -570,12 +570,16 @@ define void @test14(ptr %ptr1, ptr noalias %ptr2) {
 ; CHECK-NEXT:    br label [[THEN]]
 ; CHECK:       then:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[GEP2]], [[PTR2:%.*]]
-; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_END]], label [[IF2:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[THEN_LOOP_END_CRIT_EDGE:%.*]], label [[IF2:%.*]]
+; CHECK:       then.loop.end_crit_edge:
+; CHECK-NEXT:    [[VAL3_PRE2:%.*]] = load i32, ptr [[PTR2]], align 4
+; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       if2:
+; CHECK-NEXT:    [[VAL3_PRE:%.*]] = load i32, ptr [[GEP2]], align 4
 ; CHECK-NEXT:    br label [[LOOP_END]]
 ; CHECK:       loop.end:
-; CHECK-NEXT:    [[PHI3:%.*]] = phi ptr [ [[PTR2]], [[THEN]] ], [ [[PTR1]], [[IF2]] ]
-; CHECK-NEXT:    [[VAL3]] = load i32, ptr [[GEP2]], align 4
+; CHECK-NEXT:    [[VAL3]] = phi i32 [ [[VAL3_PRE2]], [[THEN_LOOP_END_CRIT_EDGE]] ], [ [[VAL3_PRE]], [[IF2]] ]
+; CHECK-NEXT:    [[PHI3:%.*]] = phi ptr [ [[PTR2]], [[THEN_LOOP_END_CRIT_EDGE]] ], [ [[PTR1]], [[IF2]] ]
 ; CHECK-NEXT:    store i32 [[VAL3]], ptr [[PHI3]], align 4
 ; CHECK-NEXT:    br i1 undef, label [[LOOP]], label [[IF1]]
 ;



More information about the llvm-commits mailing list