[llvm] r219185 - [FastISel][AArch64] Fix "Fold sign-/zero-extends into the load instruction."

Juergen Ributzka juergen at apple.com
Mon Oct 6 20:39:59 PDT 2014


Author: ributzka
Date: Mon Oct  6 22:39:59 2014
New Revision: 219185

URL: http://llvm.org/viewvc/llvm-project?rev=219185&view=rev
Log:
[FastISel][AArch64] Fix "Fold sign-/zero-extends into the load instruction."

This commit fixes an issue with sign-/zero-extending loads that was discovered
by Richard Barton.

We now use the correct load instructions for sign-extending loads to 64bit. We
also updated the existing unit tests and added new ones.
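
For illustration, a minimal IR sketch of the sign-extend case this fix targets
(hypothetical function, not part of the commit): a one-use sign-extending load
to 64bit should now select the 64bit load form, e.g. ldrsb x0, [x0], instead of
a 32bit load followed by a separate extend.

    define i64 @sext_load_sketch(i8* %p) {
      %1 = load i8* %p
      %2 = sext i8 %1 to i64
      ret i64 %2
    }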

Modified:
    llvm/trunk/lib/Target/AArch64/AArch64FastISel.cpp
    llvm/trunk/test/CodeGen/AArch64/fast-isel-int-ext.ll

Modified: llvm/trunk/lib/Target/AArch64/AArch64FastISel.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64FastISel.cpp?rev=219185&r1=219184&r2=219185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64FastISel.cpp (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64FastISel.cpp Mon Oct  6 22:39:59 2014
@@ -178,8 +178,8 @@ private:
   bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt);
   bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
   bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS);
-  bool emitLoad(MVT VT, unsigned &ResultReg, Address Addr, bool WantZExt = true,
-                MachineMemOperand *MMO = nullptr);
+  bool emitLoad(MVT VT, MVT ResultVT, unsigned &ResultReg, Address Addr,
+                bool WantZExt = true, MachineMemOperand *MMO = nullptr);
   bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
                  MachineMemOperand *MMO = nullptr);
   unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
@@ -260,6 +260,8 @@ public:
 static bool isIntExtFree(const Instruction *I) {
   assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
          "Unexpected integer extend instruction.");
+  assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() &&
+         "Unexpected value type.");
   bool IsZExt = isa<ZExtInst>(I);
 
   if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)))
@@ -1589,8 +1591,9 @@ unsigned AArch64FastISel::emitAnd_ri(MVT
   return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm);
 }
 
-bool AArch64FastISel::emitLoad(MVT VT, unsigned &ResultReg, Address Addr,
-                               bool WantZExt, MachineMemOperand *MMO) {
+bool AArch64FastISel::emitLoad(MVT VT, MVT RetVT, unsigned &ResultReg,
+                               Address Addr, bool WantZExt,
+                               MachineMemOperand *MMO) {
   // Simplify this down to something we can handle.
   if (!simplifyAddress(Addr, VT))
     return false;
@@ -1607,25 +1610,41 @@ bool AArch64FastISel::emitLoad(MVT VT, u
     ScaleFactor = 1;
   }
 
-  static const unsigned GPOpcTable[2][4][4] = {
+  static const unsigned GPOpcTable[2][8][4] = {
     // Sign-extend.
-    { { AArch64::LDURSBWi,  AArch64::LDURSHWi,  AArch64::LDURSWi,
+    { { AArch64::LDURSBWi,  AArch64::LDURSHWi,  AArch64::LDURWi,
+        AArch64::LDURXi  },
+      { AArch64::LDURSBXi,  AArch64::LDURSHXi,  AArch64::LDURSWi,
         AArch64::LDURXi  },
-      { AArch64::LDRSBWui,  AArch64::LDRSHWui,  AArch64::LDRSWui,
+      { AArch64::LDRSBWui,  AArch64::LDRSHWui,  AArch64::LDRWui,
         AArch64::LDRXui  },
-      { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRSWroX,
+      { AArch64::LDRSBXui,  AArch64::LDRSHXui,  AArch64::LDRSWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX,
+        AArch64::LDRXroX },
+      { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX,
         AArch64::LDRXroX },
-      { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRSWroW,
+      { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW,
         AArch64::LDRXroW },
+      { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW,
+        AArch64::LDRXroW }
     },
     // Zero-extend.
     { { AArch64::LDURBBi,   AArch64::LDURHHi,   AArch64::LDURWi,
         AArch64::LDURXi  },
+      { AArch64::LDURBBi,   AArch64::LDURHHi,   AArch64::LDURWi,
+        AArch64::LDURXi  },
       { AArch64::LDRBBui,   AArch64::LDRHHui,   AArch64::LDRWui,
         AArch64::LDRXui  },
+      { AArch64::LDRBBui,   AArch64::LDRHHui,   AArch64::LDRWui,
+        AArch64::LDRXui  },
+      { AArch64::LDRBBroX,  AArch64::LDRHHroX,  AArch64::LDRWroX,
+        AArch64::LDRXroX },
       { AArch64::LDRBBroX,  AArch64::LDRHHroX,  AArch64::LDRWroX,
         AArch64::LDRXroX },
       { AArch64::LDRBBroW,  AArch64::LDRHHroW,  AArch64::LDRWroW,
+        AArch64::LDRXroW },
+      { AArch64::LDRBBroW,  AArch64::LDRHHroW,  AArch64::LDRWroW,
         AArch64::LDRXroW }
     }
   };
@@ -1646,24 +1665,28 @@ bool AArch64FastISel::emitLoad(MVT VT, u
       Addr.getExtendType() == AArch64_AM::SXTW)
     Idx++;
 
+  bool IsRet64Bit = RetVT == MVT::i64;
   switch (VT.SimpleTy) {
   default:
     llvm_unreachable("Unexpected value type.");
   case MVT::i1: // Intentional fall-through.
   case MVT::i8:
-    Opc = GPOpcTable[WantZExt][Idx][0];
-    RC = &AArch64::GPR32RegClass;
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
     break;
   case MVT::i16:
-    Opc = GPOpcTable[WantZExt][Idx][1];
-    RC = &AArch64::GPR32RegClass;
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
     break;
   case MVT::i32:
-    Opc = GPOpcTable[WantZExt][Idx][2];
-    RC = WantZExt ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2];
+    RC = (IsRet64Bit && !WantZExt) ?
+             &AArch64::GPR64RegClass: &AArch64::GPR32RegClass;
     break;
   case MVT::i64:
-    Opc = GPOpcTable[WantZExt][Idx][3];
+    Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3];
     RC = &AArch64::GPR64RegClass;
     break;
   case MVT::f32:
@@ -1682,15 +1705,22 @@ bool AArch64FastISel::emitLoad(MVT VT, u
                                     TII.get(Opc), ResultReg);
   addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO);
 
-  // For 32bit loads we do sign-extending loads to 64bit and then extract the
-  // subreg. In the end this is just a NOOP.
-  if (VT == MVT::i32 && !WantZExt)
-    ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, /*IsKill=*/true,
-                                           AArch64::sub_32);
+  // For zero-extending loads to 64bit we emit a 32bit load and then convert
+  // the w-reg to an x-reg. In the end this is just a noop and will be removed.
+  if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) {
+    unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Reg64)
+        .addImm(0)
+        .addReg(ResultReg, getKillRegState(true))
+        .addImm(AArch64::sub_32);
+    ResultReg = Reg64;
+  }
 
   // Loading an i1 requires special handling.
   if (VT == MVT::i1) {
-    unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1);
+    unsigned ANDReg = emitAnd_ri(IsRet64Bit ? MVT::i64 : MVT::i32, ResultReg,
+                                 /*IsKill=*/true, 1);
     assert(ANDReg && "Unexpected AND instruction emission failure.");
     ResultReg = ANDReg;
   }
@@ -1767,11 +1797,21 @@ bool AArch64FastISel::selectLoad(const I
     return false;
 
   bool WantZExt = true;
-  if (I->hasOneUse() && isa<SExtInst>(I->use_begin()->getUser()))
-    WantZExt = false;
+  MVT RetVT = VT;
+  if (I->hasOneUse()) {
+    if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) {
+      if (!isTypeSupported(ZE->getType(), RetVT, /*IsVectorAllowed=*/false))
+        RetVT = VT;
+    } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) {
+      if (!isTypeSupported(SE->getType(), RetVT, /*IsVectorAllowed=*/false))
+        RetVT = VT;
+      WantZExt = false;
+    }
+  }
 
   unsigned ResultReg;
-  if (!emitLoad(VT, ResultReg, Addr, WantZExt, createMachineMemOperandFor(I)))
+  if (!emitLoad(VT, RetVT, ResultReg, Addr, WantZExt,
+                createMachineMemOperandFor(I)))
     return false;
 
   updateValueMap(I, ResultReg);
@@ -2897,7 +2937,7 @@ bool AArch64FastISel::tryEmitSmallMemCpy
 
     bool RV;
     unsigned ResultReg;
-    RV = emitLoad(VT, ResultReg, Src);
+    RV = emitLoad(VT, VT, ResultReg, Src);
     if (!RV)
       return false;
 
@@ -3917,51 +3957,37 @@ bool AArch64FastISel::selectIntExt(const
   if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT))
     return false;
 
-  if (isIntExtFree(I)) {
-    unsigned SrcReg = getRegForValue(I->getOperand(0));
-    if (!SrcReg)
-      return false;
-    bool SrcIsKill = hasTrivialKill(I->getOperand(0));
-
-    const TargetRegisterClass *RC = (RetVT == MVT::i64) ?
-        &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
-    unsigned ResultReg = createResultReg(RC);
-    if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(AArch64::SUBREG_TO_REG), ResultReg)
-          .addImm(0)
-          .addReg(SrcReg, getKillRegState(SrcIsKill))
-          .addImm(AArch64::sub_32);
-    } else {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(TargetOpcode::COPY), ResultReg)
-          .addReg(SrcReg, getKillRegState(SrcIsKill));
-    }
-    updateValueMap(I, ResultReg);
-    return true;
-  }
-
   unsigned SrcReg = getRegForValue(I->getOperand(0));
   if (!SrcReg)
     return false;
-  bool SrcRegIsKill = hasTrivialKill(I->getOperand(0));
+  bool SrcIsKill = hasTrivialKill(I->getOperand(0));
 
-  unsigned ResultReg = 0;
-  if (isIntExtFree(I)) {
-    if (RetVT == MVT::i64) {
-      ResultReg = createResultReg(&AArch64::GPR64RegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(AArch64::SUBREG_TO_REG), ResultReg)
-          .addImm(0)
-          .addReg(SrcReg, getKillRegState(SrcRegIsKill))
-          .addImm(AArch64::sub_32);
-    } else
-      ResultReg = SrcReg;
+  // The load instruction selection code handles the sign-/zero-extension.
+  if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0))) {
+    if (LI->hasOneUse()) {
+      updateValueMap(I, SrcReg);
+      return true;
+    }
   }
 
-  if (!ResultReg)
-    ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, isa<ZExtInst>(I));
+  bool IsZExt = isa<ZExtInst>(I);
+  if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) {
+    if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) {
+      if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
+        unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(AArch64::SUBREG_TO_REG), ResultReg)
+            .addImm(0)
+            .addReg(SrcReg, getKillRegState(SrcIsKill))
+            .addImm(AArch64::sub_32);
+        SrcReg = ResultReg;
+      }
+      updateValueMap(I, SrcReg);
+      return true;
+    }
+  }
 
+  unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt);
   if (!ResultReg)
     return false;
 

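For the zero-extending counterpart, a short sketch of what the updated emitLoad
produces (hypothetical function name; expected selection inferred from the
tests below, not taken verbatim from the commit): a 32bit load whose w-reg
result is widened to an x-reg via SUBREG_TO_REG, which is a noop and is removed
later.

    define i64 @zext_load_sketch(i8* %p) {
      %1 = load i8* %p          ; selects ldrb w0, [x0]
      %2 = zext i8 %1 to i64    ; folded into the load; SUBREG_TO_REG noop
      ret i64 %2
    }
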
Modified: llvm/trunk/test/CodeGen/AArch64/fast-isel-int-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/fast-isel-int-ext.ll?rev=219185&r1=219184&r2=219185&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/fast-isel-int-ext.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/fast-isel-int-ext.ll Mon Oct  6 22:39:59 2014
@@ -6,9 +6,9 @@
 ;
 ; SHIFT
 ;
-define i64 @load_addr_shift_zext1(i32 zeroext %a, i64 %b) {
+define i64 @load_addr_shift_zext1(i32 %a, i64 %b) {
 ; CHECK-LABEL: load_addr_shift_zext1
-; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
   %1 = zext i32 %a to i64
   %2 = shl i64 %1, 3
   %3 = add i64 %b, %2
@@ -17,9 +17,9 @@ define i64 @load_addr_shift_zext1(i32 ze
   ret i64 %5
 }
 
-define i64 @load_addr_shift_zext2(i32 signext %a, i64 %b) {
+define i64 @load_addr_shift_zext2(i32 zeroext %a, i64 %b) {
 ; CHECK-LABEL: load_addr_shift_zext2
-; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3{{\]}}
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
   %1 = zext i32 %a to i64
   %2 = shl i64 %1, 3
   %3 = add i64 %b, %2
@@ -28,9 +28,20 @@ define i64 @load_addr_shift_zext2(i32 si
   ret i64 %5
 }
 
-define i64 @load_addr_shift_sext1(i32 signext %a, i64 %b) {
+define i64 @load_addr_shift_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_zext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
+  %1 = zext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_shift_sext1(i32 %a, i64 %b) {
 ; CHECK-LABEL: load_addr_shift_sext1
-; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
   %1 = sext i32 %a to i64
   %2 = shl i64 %1, 3
   %3 = add i64 %b, %2
@@ -50,12 +61,23 @@ define i64 @load_addr_shift_sext2(i32 ze
   ret i64 %5
 }
 
+define i64 @load_addr_shift_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_shift_sext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = sext i32 %a to i64
+  %2 = shl i64 %1, 3
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
 ;
 ; MUL
 ;
-define i64 @load_addr_mul_zext1(i32 zeroext %a, i64 %b) {
+define i64 @load_addr_mul_zext1(i32 %a, i64 %b) {
 ; CHECK-LABEL: load_addr_mul_zext1
-; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
   %1 = zext i32 %a to i64
   %2 = mul i64 %1, 8
   %3 = add i64 %b, %2
@@ -64,8 +86,19 @@ define i64 @load_addr_mul_zext1(i32 zero
   ret i64 %5
 }
 
-define i64 @load_addr_mul_zext2(i32 signext %a, i64 %b) {
+define i64 @load_addr_mul_zext2(i32 zeroext %a, i64 %b) {
 ; CHECK-LABEL: load_addr_mul_zext2
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = zext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+define i64 @load_addr_mul_zext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_zext3
 ; CHECK:       ldr {{x[0-9]+}}, [x1, w0, uxtw #3]
   %1 = zext i32 %a to i64
   %2 = mul i64 %1, 8
@@ -75,9 +108,9 @@ define i64 @load_addr_mul_zext2(i32 sign
   ret i64 %5
 }
 
-define i64 @load_addr_mul_sext1(i32 signext %a, i64 %b) {
+define i64 @load_addr_mul_sext1(i32 %a, i64 %b) {
 ; CHECK-LABEL: load_addr_mul_sext1
-; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+; CHECK:       ldr {{x[0-9]+}}, [x1, w0, sxtw #3]
   %1 = sext i32 %a to i64
   %2 = mul i64 %1, 8
   %3 = add i64 %b, %2
@@ -97,94 +130,372 @@ define i64 @load_addr_mul_sext2(i32 zero
   ret i64 %5
 }
 
+define i64 @load_addr_mul_sext3(i32 signext %a, i64 %b) {
+; CHECK-LABEL: load_addr_mul_sext3
+; CHECK:       ldr {{x[0-9]+}}, [x1, x0, lsl #3]
+  %1 = sext i32 %a to i64
+  %2 = mul i64 %1, 8
+  %3 = add i64 %b, %2
+  %4 = inttoptr i64 %3 to i64*
+  %5 = load i64* %4
+  ret i64 %5
+}
+
+
+;
 ; Test folding of the sign-/zero-extend into the load instruction.
-define i32 @load_zext_i8_to_i32(i8* %a) {
-; CHECK-LABEL: load_zext_i8_to_i32
-; CHECK:       ldrb w0, [x0]
+;
+
+; Unscaled
+define i32 @load_unscaled_zext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i32
+; CHECK:       ldurb w0, [x0, #-8]
 ; CHECK-NOT:   uxtb
-  %1 = load i8* %a
-  %2 = zext i8 %1 to i32
-  ret i32 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
 }
 
-define i32 @load_zext_i16_to_i32(i16* %a) {
-; CHECK-LABEL: load_zext_i16_to_i32
-; CHECK:       ldrh w0, [x0]
+define i32 @load_unscaled_zext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i32
+; CHECK:       ldurh w0, [x0, #-8]
 ; CHECK-NOT:   uxth
-  %1 = load i16* %a
-  %2 = zext i16 %1 to i32
-  ret i32 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
 }
 
-define i64 @load_zext_i8_to_i64(i8* %a) {
-; CHECK-LABEL: load_zext_i8_to_i64
-; CHECK:       ldrb w0, [x0]
+define i64 @load_unscaled_zext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i8_to_i64
+; CHECK:       ldurb w0, [x0, #-8]
 ; CHECK-NOT:   uxtb
-  %1 = load i8* %a
-  %2 = zext i8 %1 to i64
-  ret i64 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i64
+  ret i64 %4
 }
 
-define i64 @load_zext_i16_to_i64(i16* %a) {
-; CHECK-LABEL: load_zext_i16_to_i64
-; CHECK:       ldrh w0, [x0]
+define i64 @load_unscaled_zext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i16_to_i64
+; CHECK:       ldurh w0, [x0, #-8]
 ; CHECK-NOT:   uxth
-  %1 = load i16* %a
-  %2 = zext i16 %1 to i64
-  ret i64 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i64
+  ret i64 %4
 }
 
-define i64 @load_zext_i32_to_i64(i32* %a) {
-; CHECK-LABEL: load_zext_i32_to_i64
-; CHECK:       ldr w0, [x0]
+define i64 @load_unscaled_zext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_zext_i32_to_i64
+; CHECK:       ldur w0, [x0, #-8]
 ; CHECK-NOT:   uxtw
-  %1 = load i32* %a
-  %2 = zext i32 %1 to i64
-  ret i64 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = zext i32 %3 to i64
+  ret i64 %4
 }
 
-define i32 @load_sext_i8_to_i32(i8* %a) {
-; CHECK-LABEL: load_sext_i8_to_i32
-; CHECK:       ldrsb w0, [x0]
+define i32 @load_unscaled_sext_i8_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i32
+; CHECK:       ldursb w0, [x0, #-8]
 ; CHECK-NOT:   sxtb
-  %1 = load i8* %a
-  %2 = sext i8 %1 to i32
-  ret i32 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i32
+  ret i32 %4
 }
 
-define i32 @load_sext_i16_to_i32(i16* %a) {
-; CHECK-LABEL: load_sext_i16_to_i32
-; CHECK:       ldrsh w0, [x0]
+define i32 @load_unscaled_sext_i16_to_i32(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i32
+; CHECK:       ldursh w0, [x0, #-8]
 ; CHECK-NOT:   sxth
-  %1 = load i16* %a
-  %2 = sext i16 %1 to i32
-  ret i32 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i32
+  ret i32 %4
 }
 
-define i64 @load_sext_i8_to_i64(i8* %a) {
-; CHECK-LABEL: load_sext_i8_to_i64
-; CHECK:       ldrsb w0, [x0]
+define i64 @load_unscaled_sext_i8_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i8_to_i64
+; CHECK:       ldursb x0, [x0, #-8]
 ; CHECK-NOT:   sxtb
-  %1 = load i8* %a
-  %2 = sext i8 %1 to i64
-  ret i64 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i64
+  ret i64 %4
 }
 
-define i64 @load_sext_i16_to_i64(i16* %a) {
-; CHECK-LABEL: load_sext_i16_to_i64
-; CHECK:       ldrsh w0, [x0]
+define i64 @load_unscaled_sext_i16_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i16_to_i64
+; CHECK:       ldursh x0, [x0, #-8]
 ; CHECK-NOT:   sxth
-  %1 = load i16* %a
-  %2 = sext i16 %1 to i64
-  ret i64 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i64
+  ret i64 %4
 }
 
-define i64 @load_sext_i32_to_i64(i32* %a) {
-; CHECK-LABEL: load_sext_i32_to_i64
-; CHECK:       ldrsw x0, [x0]
+define i64 @load_unscaled_sext_i32_to_i64(i64 %a) {
+; CHECK-LABEL: load_unscaled_sext_i32_to_i64
+; CHECK:       ldursw x0, [x0, #-8]
 ; CHECK-NOT:   sxtw
-  %1 = load i32* %a
-  %2 = sext i32 %1 to i64
-  ret i64 %2
+  %1 = add i64 %a, -8
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
+; Register
+define i32 @load_register_zext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i32
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_zext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i32
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_zext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i8_to_i64
+; CHECK:       ldrb w0, [x0, x1]
+; CHECK-NOT:   uxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = zext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i16_to_i64
+; CHECK:       ldrh w0, [x0, x1]
+; CHECK-NOT:   uxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = zext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_zext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_zext_i32_to_i64
+; CHECK:       ldr w0, [x0, x1]
+; CHECK-NOT:   uxtw
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = zext i32 %3 to i64
+  ret i64 %4
+}
+
+define i32 @load_register_sext_i8_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i32
+; CHECK:       ldrsb w0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @load_register_sext_i16_to_i32(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i32
+; CHECK:       ldrsh w0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i32
+  ret i32 %4
+}
+
+define i64 @load_register_sext_i8_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i8_to_i64
+; CHECK:       ldrsb x0, [x0, x1]
+; CHECK-NOT:   sxtb
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i8*
+  %3 = load i8* %2
+  %4 = sext i8 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i16_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i16_to_i64
+; CHECK:       ldrsh x0, [x0, x1]
+; CHECK-NOT:   sxth
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i16*
+  %3 = load i16* %2
+  %4 = sext i16 %3 to i64
+  ret i64 %4
+}
+
+define i64 @load_register_sext_i32_to_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: load_register_sext_i32_to_i64
+; CHECK:       ldrsw x0, [x0, x1]
+; CHECK-NOT:   sxtw
+  %1 = add i64 %a, %b
+  %2 = inttoptr i64 %1 to i32*
+  %3 = load i32* %2
+  %4 = sext i32 %3 to i64
+  ret i64 %4
+}
+
+; Extend
+define i32 @load_extend_zext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i32
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrb w0, [x0, [[REG]]]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = zext i8 %4 to i32
+  ret i32 %5
+}
+
+define i32 @load_extend_zext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i32
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrh w0, [x0, [[REG]]]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = zext i16 %4 to i32
+  ret i32 %5
+}
+
+define i64 @load_extend_zext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i8_to_i64
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrb w0, [x0, [[REG]]]
+; CHECK-NOT:   uxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = zext i8 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i16_to_i64
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrh w0, [x0, [[REG]]]
+; CHECK-NOT:   uxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = zext i16 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_zext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_zext_i32_to_i64
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldr w0, [x0, [[REG]]]
+; CHECK-NOT:   uxtw
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  %5 = zext i32 %4 to i64
+  ret i64 %5
+}
+
+define i32 @load_extend_sext_i8_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i32
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrsb w0, [x0, [[REG]]]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = sext i8 %4 to i32
+  ret i32 %5
+}
+
+define i32 @load_extend_sext_i16_to_i32(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i32
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrsh w0, [x0, [[REG]]]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = sext i16 %4 to i32
+  ret i32 %5
+}
+
+define i64 @load_extend_sext_i8_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i8_to_i64
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrsb x0, [x0, [[REG]]]
+; CHECK-NOT:   sxtb
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8* %3
+  %5 = sext i8 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i16_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i16_to_i64
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrsh x0, [x0, [[REG]]]
+; CHECK-NOT:   sxth
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i16*
+  %4 = load i16* %3
+  %5 = sext i16 %4 to i64
+  ret i64 %5
+}
+
+define i64 @load_extend_sext_i32_to_i64(i64 %a, i32 %b) {
+; CHECK-LABEL: load_extend_sext_i32_to_i64
+; CHECK:       sxtw [[REG:x[0-9]+]], w1
+; CHECK-NEXT:  ldrsw x0, [x0, [[REG]]]
+; CHECK-NOT:   sxtw
+  %1 = sext i32 %b to i64
+  %2 = add i64 %a, %1
+  %3 = inttoptr i64 %2 to i32*
+  %4 = load i32* %3
+  %5 = sext i32 %4 to i64
+  ret i64 %5
 }
 