[llvm] 13f6c81 - [BPF] simplify zero extension with MOV_32_64

Yonghong Song via llvm-commits llvm-commits at lists.llvm.org
Wed May 27 11:27:10 PDT 2020


Author: John Fastabend
Date: 2020-05-27T11:26:39-07:00
New Revision: 13f6c81c5d9a7a34a684363bcaad8eb7c65356fd

URL: https://github.com/llvm/llvm-project/commit/13f6c81c5d9a7a34a684363bcaad8eb7c65356fd
DIFF: https://github.com/llvm/llvm-project/commit/13f6c81c5d9a7a34a684363bcaad8eb7c65356fd.diff

LOG: [BPF] simplify zero extension with MOV_32_64

The current pattern matching for zext results in the following code snippet
being produced,

  w1 = w0
  r1 <<= 32
  r1 >>= 32

Because BPF implementations require zero extension on 32bit loads this
both adds a few extra unneeded instructions but also makes it a bit
harder for the verifier to track the r1 register bounds. For example in
this verifier trace we see at the end of the snippet R2 offset is unknown.
However, if we track this correctly we see w1 should have the same bounds
as r8. R8 smax is less than U32 max value so a zero extend load should keep
the same value. Adding a max value of 800 (R8=inv(id=0,smax_value=800)) to
an off=0, as seen in R7 should create a max offset of 800. However at the
end of the snippet we note the R2 max offset is 0xffffFFFF.

  R0=inv(id=0,smax_value=800)
  R1_w=inv(id=0,umax_value=2147483647,var_off=(0x0; 0x7fffffff))
  R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
  R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
  R9=inv800 R10=fp0 fp-8=mmmm????
 58: (1c) w9 -= w8
 59: (bc) w1 = w8
 60: (67) r1 <<= 32
 61: (77) r1 >>= 32
 62: (bf) r2 = r7
 63: (0f) r2 += r1
 64: (bf) r1 = r6
 65: (bc) w3 = w9
 66: (b7) r4 = 0
 67: (85) call bpf_get_stack#67
  R0=inv(id=0,smax_value=800)
  R1_w=ctx(id=0,off=0,imm=0)
  R2_w=map_value(id=0,off=0,ks=4,vs=1600,umax_value=4294967295,var_off=(0x0; 0xffffffff))
  R3_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
  R4_w=inv0 R6=ctx(id=0,off=0,imm=0)
  R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
  R8_w=inv(id=0,smax_value=800,umax_value=4294967295,var_off=(0x0; 0xffffffff))
  R9_w=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
  R10=fp0 fp-8=mmmm????

After this patch R1 bounds are not smashed by the <<=32 >>=32 shift and we
get correct bounds on R2 umax_value=800.

Further it reduces 3 insns to 1.

Signed-off-by: John Fastabend <john.fastabend at gmail.com>

Differential Revision: https://reviews.llvm.org/D73985

Added: 
    llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll

Modified: 
    llvm/lib/Target/BPF/BPFISelLowering.cpp
    llvm/lib/Target/BPF/BPFInstrInfo.td
    llvm/lib/Target/BPF/BPFMIPeephole.cpp
    llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
    llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
    llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
    llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll
    llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index cc8a48677538..a02556a39909 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -604,6 +604,12 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
   DebugLoc DL = MI.getDebugLoc();
 
   MachineRegisterInfo &RegInfo = F->getRegInfo();
+
+  if (!isSigned) {
+    Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
+    return PromotedReg0;
+  }
   Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
   Register PromotedReg1 = RegInfo.createVirtualRegister(RC);
   Register PromotedReg2 = RegInfo.createVirtualRegister(RC);

diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 6781d09b846e..4298e2eaec04 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -732,8 +732,7 @@ let isCodeGenOnly = 1 in {
 def : Pat<(i64 (sext GPR32:$src)),
           (SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
 
-def : Pat<(i64 (zext GPR32:$src)),
-          (SRL_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+def : Pat<(i64 (zext GPR32:$src)), (MOV_32_64 GPR32:$src)>;
 
 // For i64 -> i32 truncation, use the 32-bit subregister directly.
 def : Pat<(i32 (trunc GPR:$src)),

diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index a2ceade66800..fe955fad0424 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -301,19 +301,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
       //
       //   MOV rA, rA
       //
-      // This is particularly possible to happen when sub-register support
-      // enabled. The special type cast insn MOV_32_64 involves different
-      // register class on src (i32) and dst (i64), RA could generate useless
-      // instruction due to this.
+      // Note that we cannot remove
+      //   MOV_32_64  rA, wA
+      //   MOV_rr_32  wA, wA
+      // as these two instructions having side effects, zeroing out
+      // top 32 bits of rA.
       unsigned Opcode = MI.getOpcode();
-      if (Opcode == BPF::MOV_32_64 ||
-          Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) {
+      if (Opcode == BPF::MOV_rr) {
         Register dst = MI.getOperand(0).getReg();
         Register src = MI.getOperand(1).getReg();
 
-        if (Opcode == BPF::MOV_32_64)
-          dst = TRI->getSubReg(dst, BPF::sub_32);
-
         if (dst != src)
           continue;
 

diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
index 160be56c30a3..e8a4f81a0240 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-cond-select.ll
@@ -56,8 +56,9 @@ entry:
   ret i32 %c.d
 }
 ; CHECK-LABEL: select_cc_32
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i64 @select_cc_32_64(i32 %a, i32 %b, i64 %c, i64 %d) local_unnamed_addr #0 {
@@ -67,8 +68,9 @@ entry:
   ret i64 %c.d
 }
 ; CHECK-LABEL: select_cc_32_64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i32 @select_cc_64_32(i64 %a, i64 %b, i32 %c, i32 %d) local_unnamed_addr #0 {
@@ -88,8 +90,9 @@ entry:
   ret i32 %c.d
 }
 ; CHECK-LABEL: selecti_cc_32
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i64 @selecti_cc_32_64(i32 %a, i64 %c, i64 %d) local_unnamed_addr #0 {
@@ -99,8 +102,9 @@ entry:
   ret i64 %c.d
 }
 ; CHECK-LABEL: selecti_cc_32_64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 
 ; Function Attrs: norecurse nounwind readnone
 define dso_local i32 @selecti_cc_64_32(i64 %a, i32 %c, i32 %d) local_unnamed_addr #0 {

diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
index 5a72f59593c6..2fc1e6c2783b 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-1.ll
@@ -27,7 +27,7 @@ entry:
   %call = tail call i32 @helper(i32 %conv)
   ret i32 %call
 }
-; CHECK: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
 ; CHECK-NOT: r{{[0-9]+}} >>= 32
 ; CHECK: if r{{[0-9]+}} == r{{[0-9]+}} goto
 

diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
index 46a1b231c1f0..da69657d02d0 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-2.ll
@@ -27,8 +27,8 @@ entry:
   %call = tail call i32 @helper(i32 %conv)
   ret i32 %call
 }
-; CHECK: r{{[0-9]+}} >>= 32
-; CHECK: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
 ; CHECK: if r{{[0-9]+}} == r{{[0-9]+}} goto
 
 declare dso_local i32 @helper(i32) local_unnamed_addr

diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll
index d46214032e6e..3f3f9c8c4a55 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole-phi-3.ll
@@ -44,8 +44,9 @@ for.body:                                         ; preds = %for.body, %entry
   %exitcond = icmp eq i64 %inc, 100
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !2
 }
-; CHECK: [[VAL:r[0-9]+]] <<= 32
-; CHECK: [[VAL]] >>= 32
+; CHECK: [[VAL:r[0-9]+]] = w{{[0-9]+}}
+; CHECK-NOT: [[VAL:r[0-9]+]] <<= 32
+; CHECK-NOT: [[VAL]] >>= 32
 ; CHECK: if [[VAL]] == 0 goto
 
 !2 = distinct !{!2, !3}

diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
index 63a7c25ed33b..7c5be7f1987a 100644
--- a/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-peephole.ll
@@ -47,8 +47,9 @@ define dso_local i64 @select_u(i32 %a, i32 %b, i64 %c, i64 %d) local_unnamed_add
 entry:
   %cmp = icmp ugt i32 %a, %b
   %c.d = select i1 %cmp, i64 %c, i64 %d
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
 ; CHECK: if r{{[0-9]+}} {{<|>}} r{{[0-9]+}} goto
   ret i64 %c.d
 }
@@ -58,8 +59,9 @@ define dso_local i64 @select_u_2(i32 %a, i64 %b, i64 %c, i64 %d) local_unnamed_a
 ; CHECK-LABEL: select_u_2:
 entry:
   %conv = zext i32 %a to i64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
   %cmp = icmp ugt i64 %conv, %b
   %c.d = select i1 %cmp, i64 %c, i64 %d
   ret i64 %c.d
@@ -84,10 +86,11 @@ entry:
   %call = tail call i64 bitcast (i64 (...)* @bar to i64 ()*)() #2
   %conv = trunc i64 %call to i32
   %cmp = icmp ult i32 %conv, 10
-; The shifts can't be optimized out because %call comes from function call
-; returning i64 so the high bits might be valid.
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; %call comes from function call returning i64 so the high bits will need
+; to be cleared.
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
   %b.c = select i1 %cmp, i32 %b, i32 %c
 ; CHECK: if r{{[0-9]+}} {{<|>}} {{[0-9]+}} goto
   ret i32 %b.c
@@ -100,8 +103,9 @@ define dso_local i32* @inc_p(i32* readnone %p, i32 %a) local_unnamed_addr #0 {
 ; CHECK-LABEL: inc_p:
 entry:
   %idx.ext = zext i32 %a to i64
-; CHECK: r{{[0-9]+}} <<= 32
-; CHECK-NEXT: r{{[0-9]+}} >>= 32
+; CHECK: r{{[0-9]+}} = w{{[0-9]+}}
+; CHECK-NOT: r{{[0-9]+}} <<= 32
+; CHECK-NOT: r{{[0-9]+}} >>= 32
   %add.ptr = getelementptr inbounds i32, i32* %p, i64 %idx.ext
   ret i32* %add.ptr
 }

diff --git a/llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll b/llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll
new file mode 100644
index 000000000000..57ea93a8fe6e
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/32-bit-subreg-zext.ll
@@ -0,0 +1,21 @@
+; RUN: llc -O2 -march=bpfel -mattr=+alu32 < %s | FileCheck %s
+; RUN: llc -O2 -march=bpfel -mcpu=v3 < %s | FileCheck %s
+; RUN: llc -O2 -march=bpfeb -mattr=+alu32 < %s | FileCheck %s
+; RUN: llc -O2 -march=bpfeb -mcpu=v3 < %s | FileCheck %s
+;
+; long zext(unsigned int a)
+; {
+;   long b = a;
+;   return b;
+; }
+
+; Function Attrs: norecurse nounwind
+define dso_local i64 @zext(i32 %a) local_unnamed_addr #0 {
+entry:
+  %conv = zext i32 %a to i64
+  ; CHECK-NOT: r[[#]] <<= 32
+  ; CHECK-NOT: r[[#]] >>= 32
+  ret i64 %conv
+}
+
+attributes #0 = { norecurse nounwind }


        


More information about the llvm-commits mailing list