[llvm] 019ab61 - [NVPTX, LSV] Move the LSV optimization pass to later when the graph is cleaner

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 13 12:16:12 PST 2020


Author: Frederic Bastien
Date: 2020-02-13T12:15:38-08:00
New Revision: 019ab61e25f2f03c91063d2bafad6f22b6646ffe

URL: https://github.com/llvm/llvm-project/commit/019ab61e25f2f03c91063d2bafad6f22b6646ffe
DIFF: https://github.com/llvm/llvm-project/commit/019ab61e25f2f03c91063d2bafad6f22b6646ffe.diff

LOG: [NVPTX, LSV] Move the LSV optimization pass to later when the graph is cleaner

This allows it to recognize more loads as consecutive in cases where the load addresses start out complex.

Differential Revision: https://reviews.llvm.org/D74444
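
To make the motivation concrete, here is a minimal illustrative sketch (not part of
the patch; the function and value names are made up). The two byte loads below are
one byte apart, but their offsets are built from shift/or arithmetic. Running
EarlyCSE/GVN before the LoadStoreVectorizer canonicalizes and de-duplicates that
address math, so the vectorizer can prove the accesses are consecutive and merge
them, as in the new foo_complex test added below.

  define void @two_adjacent_bytes(i8* %buf, i32 %x) {
    ; The base offset is even, so "or %base, 1" is the same as "add %base, 1".
    %base = shl nuw nsw i32 %x, 1
    %off1 = or i32 %base, 1
    %i0 = zext i32 %base to i64
    %i1 = zext i32 %off1 to i64
    %p0 = getelementptr inbounds i8, i8* %buf, i64 %i0
    %p1 = getelementptr inbounds i8, i8* %buf, i64 %i1
    ; Adjacent single-byte loads that can be merged into one vector load
    ; (e.g. ld.v2.u8 on NVPTX) once the offsets are seen to be consecutive.
    %v0 = load i8, i8* %p0, align 2
    %v1 = load i8, i8* %p1, align 1
    %s = add i8 %v0, %v1
    store i8 %s, i8* %p1, align 1
    ret void
  }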

Added: 
    

Modified: 
    llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
    llvm/test/CodeGen/NVPTX/vector-loads.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 77487d1c99a4..85709eb731e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -276,8 +276,6 @@ void NVPTXPassConfig::addIRPasses() {
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
   if (getOptLevel() != CodeGenOpt::None) {
     addAddressSpaceInferencePasses();
-    if (!DisableLoadStoreVectorizer)
-      addPass(createLoadStoreVectorizerPass());
     addStraightLineScalarOptimizationPasses();
   }
 
@@ -295,8 +293,11 @@ void NVPTXPassConfig::addIRPasses() {
   //   %1 = shl %a, 2
   //
   // but EarlyCSE can do neither of them.
-  if (getOptLevel() != CodeGenOpt::None)
+  if (getOptLevel() != CodeGenOpt::None) {
     addEarlyCSEOrGVNPass();
+    if (!DisableLoadStoreVectorizer)
+      addPass(createLoadStoreVectorizerPass());
+  }
 }
 
 bool NVPTXPassConfig::addInstSelector() {
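
(For reference, a paraphrased sketch of how the two hunks above leave the relevant
part of NVPTXPassConfig::addIRPasses ordered after this change; the passes that sit
between the two blocks are elided.)

  if (getOptLevel() != CodeGenOpt::None) {
    addAddressSpaceInferencePasses();
    addStraightLineScalarOptimizationPasses();
  }

  // ... other IR passes, unchanged by this patch ...

  if (getOptLevel() != CodeGenOpt::None) {
    addEarlyCSEOrGVNPass();                       // canonicalize the address arithmetic first
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());   // then vectorize the now-simpler loads
  }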

diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll
index d70348942200..ab76a14144c3 100644
--- a/llvm/test/CodeGen/NVPTX/vector-loads.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll
@@ -7,8 +7,8 @@
 ;
 ; which will load two floats at once into scalar registers.
 
+; CHECK-LABEL: foo
 define void @foo(<2 x float>* %a) {
-; CHECK: .func foo
 ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = load <2 x float>, <2 x float>* %a
   %t2 = fmul <2 x float> %t1, %t1
@@ -16,8 +16,8 @@ define void @foo(<2 x float>* %a) {
   ret void
 }
 
+; CHECK-LABEL: foo2
 define void @foo2(<4 x float>* %a) {
-; CHECK: .func foo2
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = load <4 x float>, <4 x float>* %a
   %t2 = fmul <4 x float> %t1, %t1
@@ -25,8 +25,8 @@ define void @foo2(<4 x float>* %a) {
   ret void
 }
 
+; CHECK-LABEL: foo3
 define void @foo3(<8 x float>* %a) {
-; CHECK: .func foo3
 ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
   %t1 = load <8 x float>, <8 x float>* %a
@@ -37,8 +37,8 @@ define void @foo3(<8 x float>* %a) {
 
 
 
+; CHECK-LABEL: foo4
 define void @foo4(<2 x i32>* %a) {
-; CHECK: .func foo4
 ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <2 x i32>, <2 x i32>* %a
   %t2 = mul <2 x i32> %t1, %t1
@@ -46,8 +46,8 @@ define void @foo4(<2 x i32>* %a) {
   ret void
 }
 
+; CHECK-LABEL: foo5
 define void @foo5(<4 x i32>* %a) {
-; CHECK: .func foo5
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <4 x i32>, <4 x i32>* %a
   %t2 = mul <4 x i32> %t1, %t1
@@ -55,8 +55,8 @@ define void @foo5(<4 x i32>* %a) {
   ret void
 }
 
+; CHECK-LABEL: foo6
 define void @foo6(<8 x i32>* %a) {
-; CHECK: .func foo6
 ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
 ; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
   %t1 = load <8 x i32>, <8 x i32>* %a
@@ -64,3 +64,38 @@ define void @foo6(<8 x i32>* %a) {
   store <8 x i32> %t2, <8 x i32>* %a
   ret void
 }
+
+; Before this change, the loads below were not vectorized because the
+; address computation was still too complex when LSV ran.
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
+; CHECK-LABEL: foo_complex
+define void @foo_complex(i8* nocapture readonly align 16 dereferenceable(134217728) %alloc0) {
+  %targ0.1.typed = bitcast i8* %alloc0 to [1024 x [131072 x i8]]*
+  %t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1
+  %t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  %t2 = lshr i32 %t1, 8
+  %t3 = shl nuw nsw i32 %t1, 9
+  %ttile_origin.2 = and i32 %t3, 130560
+  %tstart_offset_x_mul = shl nuw nsw i32 %t0, 1
+  %t4 = or i32 %ttile_origin.2, %tstart_offset_x_mul
+  %t6 = or i32 %t4, 1
+  %t8 = or i32 %t4, 128
+  %t9 = zext i32 %t8 to i64
+  %t10 = or i32 %t4, 129
+  %t11 = zext i32 %t10 to i64
+  %t20 = zext i32 %t2 to i64
+  %t27 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t9
+; CHECK: ld.v2.u8
+  %t28 = load i8, i8* %t27, align 2
+  %t31 = getelementptr inbounds [1024 x [131072 x i8]], [1024 x [131072 x i8]]* %targ0.1.typed, i64 0, i64 %t20, i64 %t11
+  %t32 = load i8, i8* %t31, align 1
+  %t33 = icmp ult i8 %t28, %t32
+  %t34 = select i1 %t33, i8 %t32, i8 %t28
+  store i8 %t34, i8* %t31
+; CHECK: ret
+  ret void
+}
+
+
+!1 = !{i32 0, i32 64}


        

