[llvm] 8cc4830 - [RISCV] Teach RISCVCodeGenPrepare to optimize (i64 (and (zext/sext (i32 X), C1)))

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 17 11:06:39 PDT 2022


Author: Craig Topper
Date: 2022-07-17T11:00:56-07:00
New Revision: 8cc483099a0431f3dc8181b4af0f5aeba1b97ba9

URL: https://github.com/llvm/llvm-project/commit/8cc483099a0431f3dc8181b4af0f5aeba1b97ba9
DIFF: https://github.com/llvm/llvm-project/commit/8cc483099a0431f3dc8181b4af0f5aeba1b97ba9.diff

LOG: [RISCV] Teach RISCVCodeGenPrepare to optimize (i64 (and (zext/sext (i32 X), C1)))

If X is known positive by a dominating condition, we can fill in
ones into the upper bits of C1 if that would allow it to become a
simm12, allowing the use of ANDI.

This pattern often occurs in unrolled loops where the induction
variable has been widened.

To get the best benefit from this, I had to move the pass above
ConstantHoisting which is in addIRPasses. Otherwise the AND constant
is often hoisted away from the AND.

Reviewed By: asb

Differential Revision: https://reviews.llvm.org/D129888

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
    llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
    llvm/test/CodeGen/RISCV/O3-pipeline.ll
    llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
    llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 469d19f59cef..b700a9ede39b 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -50,6 +50,7 @@ class RISCVCodeGenPrepare : public FunctionPass {
 
 private:
   bool optimizeZExt(ZExtInst *I);
+  bool optimizeAndExt(BinaryOperator *BO);
 };
 
 } // end anonymous namespace
@@ -83,6 +84,57 @@ bool RISCVCodeGenPrepare::optimizeZExt(ZExtInst *ZExt) {
   return false;
 }
 
+// Try to optimize (i64 (and (zext/sext (i32 X), C1))) if C1 has bit 31 set,
+// but bits 63:32 are zero. If we can prove that bit 31 of X is 0, we can fill
+// the upper 32 bits with ones. A separate transform will turn (zext X) into
+// (sext X) for the same condition.
+bool RISCVCodeGenPrepare::optimizeAndExt(BinaryOperator *BO) {
+  if (!ST->is64Bit())
+    return false;
+
+  if (BO->getOpcode() != Instruction::And)
+    return false;
+
+  if (!BO->getType()->isIntegerTy(64))
+    return false;
+
+  // Left hand side should be sext or zext.
+  Instruction *LHS = dyn_cast<Instruction>(BO->getOperand(0));
+  if (!LHS || (!isa<SExtInst>(LHS) && !isa<ZExtInst>(LHS)))
+    return false;
+
+  Value *LHSSrc = LHS->getOperand(0);
+  if (!LHSSrc->getType()->isIntegerTy(32))
+    return false;
+
+  // Right hand side should be a constant.
+  Value *RHS = BO->getOperand(1);
+
+  auto *CI = dyn_cast<ConstantInt>(RHS);
+  if (!CI)
+    return false;
+  uint64_t C = CI->getZExtValue();
+
+  // Look for constants that fit in 32 bits but not simm12, and can be made
+  // into simm12 by sign extending bit 31. This will allow use of ANDI.
+  // TODO: Is it worth making simm32?
+  if (!isUInt<32>(C) || isInt<12>(C) || !isInt<12>(SignExtend64<32>(C)))
+    return false;
+
+  // If we can determine the sign bit of the input is 0, we can replace the
+  // And mask constant.
+  if (!isImpliedByDomCondition(ICmpInst::ICMP_SGE, LHSSrc,
+                               Constant::getNullValue(LHSSrc->getType()),
+                               LHS, *DL))
+    return false;
+
+  // Sign extend the constant and replace the And operand.
+  C = SignExtend64<32>(C);
+  BO->setOperand(1, ConstantInt::get(LHS->getType(), C));
+
+  return true;
+}
+
 bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -98,6 +150,8 @@ bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
     for (Instruction &I : llvm::make_early_inc_range(BB)) {
       if (auto *ZExt = dyn_cast<ZExtInst>(&I))
         MadeChange |= optimizeZExt(ZExt);
+      else if (I.getOpcode() == Instruction::And)
+        MadeChange |= optimizeAndExt(cast<BinaryOperator>(&I));
     }
   }
 

diff  --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b71b0e6f60c1..50fcb00e6c63 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -166,7 +166,6 @@ class RISCVPassConfig : public TargetPassConfig {
   }
 
   void addIRPasses() override;
-  void addCodeGenPrepare() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   bool addIRTranslator() override;
@@ -192,13 +191,10 @@ void RISCVPassConfig::addIRPasses() {
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVGatherScatterLoweringPass());
 
-  TargetPassConfig::addIRPasses();
-}
-
-void RISCVPassConfig::addCodeGenPrepare() {
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVCodeGenPreparePass());
-  TargetPassConfig::addCodeGenPrepare();
+
+  TargetPassConfig::addIRPasses();
 }
 
 bool RISCVPassConfig::addPreISel() {

diff  --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 5782312fbb9f..a2e6bb7a1354 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -26,6 +26,7 @@
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       RISCV gather/scatter lowering
+; CHECK-NEXT:       RISCV CodeGenPrepare
 ; CHECK-NEXT:       Module Verifier
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Canonicalize natural loops
@@ -57,7 +58,6 @@
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       TLS Variable Hoist
-; CHECK-NEXT:       RISCV CodeGenPrepare
 ; CHECK-NEXT:       CodeGen Prepare
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Exception handling preparation

diff  --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index 9bef8e024c67..2fd283d600e0 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -40,3 +40,88 @@ for.body:                                         ; preds = %for.body.preheader,
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+; Make sure we convert the 4294967294 in for.body.preheader.new to -2 based on
+; the upper 33 bits being known zero due to the dominating condition %cmp3.
+define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blez a1, .LBB1_7
+; CHECK-NEXT:  # %bb.1: # %for.body.preheader
+; CHECK-NEXT:    li a3, 1
+; CHECK-NEXT:    andi a2, a1, 1
+; CHECK-NEXT:    bne a1, a3, .LBB1_3
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    j .LBB1_5
+; CHECK-NEXT:  .LBB1_3: # %for.body.preheader.new
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    andi a1, a1, -2
+; CHECK-NEXT:    addi a4, a0, 4
+; CHECK-NEXT:  .LBB1_4: # %for.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lw a5, -4(a4)
+; CHECK-NEXT:    lw a6, 0(a4)
+; CHECK-NEXT:    addiw a5, a5, 4
+; CHECK-NEXT:    sw a5, -4(a4)
+; CHECK-NEXT:    addiw a5, a6, 4
+; CHECK-NEXT:    sw a5, 0(a4)
+; CHECK-NEXT:    addi a3, a3, 2
+; CHECK-NEXT:    addi a4, a4, 8
+; CHECK-NEXT:    bne a1, a3, .LBB1_4
+; CHECK-NEXT:  .LBB1_5: # %for.cond.cleanup.loopexit.unr-lcssa
+; CHECK-NEXT:    beqz a2, .LBB1_7
+; CHECK-NEXT:  # %bb.6: # %for.body.epil
+; CHECK-NEXT:    slli a1, a3, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    lw a1, 0(a0)
+; CHECK-NEXT:    addiw a1, a1, 4
+; CHECK-NEXT:    sw a1, 0(a0)
+; CHECK-NEXT:  .LBB1_7: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %cmp3 = icmp sgt i32 %n, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %xtraiter = and i64 %wide.trip.count, 1
+  %0 = icmp eq i32 %n, 1
+  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
+
+for.body.preheader.new:                           ; preds = %for.body.preheader
+  %unroll_iter = and i64 %wide.trip.count, 4294967294
+  br label %for.body
+
+for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
+  %indvars.iv.unr = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next.1, %for.body ]
+  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
+  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
+
+for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa
+  %arrayidx.epil = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.unr
+  %1 = load i32, ptr %arrayidx.epil, align 4
+  %add.epil = add nsw i32 %1, 4
+  store i32 %add.epil, ptr %arrayidx.epil, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.new
+  %indvars.iv = phi i64 [ 0, %for.body.preheader.new ], [ %indvars.iv.next.1, %for.body ]
+  %niter = phi i64 [ 0, %for.body.preheader.new ], [ %niter.next.1, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %2, 4
+  store i32 %add, ptr %arrayidx, align 4
+  %indvars.iv.next = or i64 %indvars.iv, 1
+  %arrayidx.1 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.next
+  %3 = load i32, ptr %arrayidx.1, align 4
+  %add.1 = add nsw i32 %3, 4
+  store i32 %add.1, ptr %arrayidx.1, align 4
+  %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
+  %niter.next.1 = add i64 %niter, 2
+  %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
+  br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
+}

diff  --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
index 2b33ccac8649..b7530c80c417 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll
@@ -51,3 +51,94 @@ for.body:                                         ; preds = %for.body.preheader,
   %exitcond.not = icmp eq i64 %lsr.iv.next, 0
   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
 }
+
+; Make sure we convert the 4294967294 in for.body.preheader.new to -2 based on
+; the upper 33 bits being known zero due to the dominating condition %cmp3.
+define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[N]], 1
+; CHECK-NEXT:    br i1 [[TMP0]], label [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
+; CHECK:       for.body.preheader.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], -2
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit.unr-lcssa:
+; CHECK-NEXT:    [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_EPIL:%.*]]
+; CHECK:       for.body.epil:
+; CHECK-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV_UNR]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4
+; CHECK-NEXT:    [[ADD_EPIL:%.*]] = add nsw i32 [[TMP1]], 4
+; CHECK-NEXT:    store i32 [[ADD_EPIL]], ptr [[ARRAYIDX_EPIL]], align 4
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_1]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], 4
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP3]], 4
+; CHECK-NEXT:    store i32 [[ADD_1]], ptr [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i64 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY]]
+;
+entry:
+  %cmp3 = icmp sgt i32 %n, 0
+  br i1 %cmp3, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %xtraiter = and i64 %wide.trip.count, 1
+  %0 = icmp eq i32 %n, 1
+  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
+
+for.body.preheader.new:                           ; preds = %for.body.preheader
+  %unroll_iter = and i64 %wide.trip.count, 4294967294
+  br label %for.body
+
+for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
+  %indvars.iv.unr = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next.1, %for.body ]
+  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
+  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
+
+for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa
+  %arrayidx.epil = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.unr
+  %1 = load i32, ptr %arrayidx.epil, align 4
+  %add.epil = add nsw i32 %1, 4
+  store i32 %add.epil, ptr %arrayidx.epil, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader.new
+  %indvars.iv = phi i64 [ 0, %for.body.preheader.new ], [ %indvars.iv.next.1, %for.body ]
+  %niter = phi i64 [ 0, %for.body.preheader.new ], [ %niter.next.1, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %2, 4
+  store i32 %add, ptr %arrayidx, align 4
+  %indvars.iv.next = or i64 %indvars.iv, 1
+  %arrayidx.1 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv.next
+  %3 = load i32, ptr %arrayidx.1, align 4
+  %add.1 = add nsw i32 %3, 4
+  store i32 %add.1, ptr %arrayidx.1, align 4
+  %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
+  %niter.next.1 = add i64 %niter, 2
+  %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter
+  br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
+}


        


More information about the llvm-commits mailing list