[llvm] 8d7c865 - [RISCV] Support __builtin_nontemporal_load/store by MachineMemOperand
Piyou Chen via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 5 23:17:22 PDT 2023
Author: Piyou Chen
Date: 2023-04-05T22:57:49-07:00
New Revision: 8d7c865c2e22108368373f2301f367edbe293c1c
URL: https://github.com/llvm/llvm-project/commit/8d7c865c2e22108368373f2301f367edbe293c1c
DIFF: https://github.com/llvm/llvm-project/commit/8d7c865c2e22108368373f2301f367edbe293c1c.diff
LOG: [RISCV] Support __builtin_nontemporal_load/store by MachineMemOperand
Differential Revision: https://reviews.llvm.org/D143361
Added:
llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
llvm/test/CodeGen/RISCV/nontemporal.ll
Modified:
llvm/lib/Target/RISCV/CMakeLists.txt
llvm/lib/Target/RISCV/RISCV.h
llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
llvm/lib/Target/RISCV/RISCVInstrInfo.td
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
llvm/test/CodeGen/RISCV/O0-pipeline.ll
llvm/test/CodeGen/RISCV/O3-pipeline.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 3186c1fe736b..c834bcce3b1b 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_target(RISCVCodeGen
RISCVExpandPseudoInsts.cpp
RISCVFrameLowering.cpp
RISCVGatherScatterLowering.cpp
+ RISCVInsertNTLHInsts.cpp
RISCVInsertVSETVLI.cpp
RISCVInstrInfo.cpp
RISCVISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index f4f378ebbacb..e0cf1cd3de6e 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -62,6 +62,9 @@ void initializeRISCVPreRAExpandPseudoPass(PassRegistry &);
FunctionPass *createRISCVExpandAtomicPseudoPass();
void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
+FunctionPass *createRISCVInsertNTLHInstsPass();
+void initializeRISCVInsertNTLHInstsPass(PassRegistry &);
+
FunctionPass *createRISCVInsertVSETVLIPass();
void initializeRISCVInsertVSETVLIPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
new file mode 100644
index 000000000000..209438a6165b
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
@@ -0,0 +1,92 @@
+//===-- RISCVInsertNTLHInsts.cpp - Insert NTLH extension instruction ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function pass that inserts non-temporal hint
+// instructions where needed.
+//
+// It checks the first MachineMemOperand of every MachineInstr.
+// If the instruction has a MachineMemOperand and its isNonTemporal() is true,
+// then an NTLH hint instruction is inserted before it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVTargetMachine.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define RISCV_INSERT_NTLH_INSTS_NAME "RISC-V insert NTLH instruction pass"
+
+namespace {
+
+/// Machine-function pass that places a non-temporal locality hint pseudo
+/// (PseudoNTLALL or PseudoCNTLALL) in front of memory instructions. The
+/// insertion logic itself lives in runOnMachineFunction.
+class RISCVInsertNTLHInsts : public MachineFunctionPass {
+public:
+  /// Instruction info of the subtarget of the function being processed.
+  /// Null until runOnMachineFunction assigns it.
+  const RISCVInstrInfo *TII = nullptr;
+  static char ID;
+
+  /// Constructs the pass and makes sure it is registered with the global
+  /// pass registry.
+  RISCVInsertNTLHInsts() : MachineFunctionPass(ID) {
+    initializeRISCVInsertNTLHInstsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  /// Declares the CFG as preserved, then defers to the base class for the
+  /// remaining analysis requirements.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// \returns the human-readable pass name.
+  StringRef getPassName() const override {
+    return RISCV_INSERT_NTLH_INSTS_NAME;
+  }
+};
+
+} // end of anonymous namespace
+
+// Definition of the static pass ID declared in the class; the constructor
+// hands it to MachineFunctionPass.
+char RISCVInsertNTLHInsts::ID = 0;
+
+/// Scans every instruction of \p MF and, when the instruction's first memory
+/// operand is flagged non-temporal, inserts an NTLH hint pseudo directly in
+/// front of it. Nothing is inserted unless the subtarget has Zihintntl.
+///
+/// \returns true if at least one hint instruction was inserted.
+bool RISCVInsertNTLHInsts::runOnMachineFunction(MachineFunction &MF) {
+  const RISCVSubtarget &Subtarget = MF.getSubtarget<RISCVSubtarget>();
+  TII = Subtarget.getInstrInfo();
+
+  if (!Subtarget.hasStdExtZihintntl())
+    return false;
+
+  // The choice of hint depends only on the subtarget, so decide it once:
+  // PseudoCNTLALL when C/Zca is present and RVC hints are enabled,
+  // PseudoNTLALL otherwise.
+  const bool UseCompressedHint =
+      Subtarget.hasStdExtCOrZca() && Subtarget.enableRVCHintInstrs();
+  const unsigned HintOpcode =
+      UseCompressedHint ? RISCV::PseudoCNTLALL : RISCV::PseudoNTLALL;
+
+  bool InsertedAny = false;
+  for (MachineBasicBlock &Block : MF) {
+    for (MachineInstr &Inst : Block) {
+      // Only the first memory operand is consulted.
+      if (Inst.memoperands_empty() ||
+          !(*Inst.memoperands_begin())->isNonTemporal())
+        continue;
+
+      // Insert the hint immediately before the memory instruction, reusing
+      // that instruction's debug location.
+      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(HintOpcode));
+      InsertedAny = true;
+    }
+  }
+
+  return InsertedAny;
+}
+
+// Register the pass under the argument string "riscv-insert-ntlh-insts" with
+// RISCV_INSERT_NTLH_INSTS_NAME as its description.
+// NOTE(review): the two trailing `false` arguments are presumably the
+// cfg-only and is-analysis flags of INITIALIZE_PASS -- confirm in PassSupport.h.
+INITIALIZE_PASS(RISCVInsertNTLHInsts, "riscv-insert-ntlh-insts",
+ RISCV_INSERT_NTLH_INSTS_NAME, false, false)
+
+/// Factory for the NTLH insertion pass: allocates a fresh
+/// RISCVInsertNTLHInsts with `new` and returns it as a FunctionPass pointer.
+FunctionPass *llvm::createRISCVInsertNTLHInstsPass() {
+  return new RISCVInsertNTLHInsts();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2ad5b814ecf1..17b5c790711e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1243,6 +1243,20 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
*TM.getMCAsmInfo());
}
+ if (!MI.memoperands_empty()) {
+ MachineMemOperand *MMO = *(MI.memoperands_begin());
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const auto &ST = MF.getSubtarget<RISCVSubtarget>();
+ if (ST.hasStdExtZihintntl() && MMO->isNonTemporal()) {
+ if (ST.hasStdExtCOrZca() && ST.enableRVCHintInstrs()) {
+ if (isCompressibleInst(MI, STI))
+ return 4; // c.ntl.all + c.load/c.store
+ return 6; // c.ntl.all + load/store
+ }
+ return 8; // ntl.all + load/store
+ }
+ }
+
if (MI.getParent() && MI.getParent()->getParent()) {
if (isCompressibleInst(MI, STI))
return 2;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 0378f65b5f49..fc21af37d0bc 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1920,3 +1920,4 @@ include "RISCVInstrInfoZicond.td"
include "RISCVInstrInfoXVentana.td"
include "RISCVInstrInfoXTHead.td"
+include "RISCVInstrInfoZihintntl.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
new file mode 100644
index 000000000000..ecc5ddedee00
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
@@ -0,0 +1,22 @@
+//===RISCVInstrInfoZihintntl.td - 'Zihintntl' instructions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file describes the RISC-V instructions from the Non-Temporal Locality
+/// Hints extension document (Zihintntl).
+///
+//===----------------------------------------------------------------------===//
+
+// Pseudo with the assembly string "ntl.all". It has no operands, is declared
+// 4 bytes in size with no load/store/side-effect flags, and expands to
+// `ADD X0, X0, X5`.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 4 in {
+ def PseudoNTLALL : Pseudo<(outs), (ins), [], "ntl.all">,
+ PseudoInstExpansion<(ADD X0, X0, X5)>;
+}
+
+// Pseudo with the assembly string "c.ntl.all". It has no operands, is
+// declared 2 bytes in size with no load/store/side-effect flags, and expands
+// to `C_ADD_HINT X0, X0, X5`.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 2 in {
+ def PseudoCNTLALL : Pseudo<(outs), (ins), [], "c.ntl.all">,
+ PseudoInstExpansion<(C_ADD_HINT X0, X0, X5)>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 594384764056..75a9ed7634f1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -83,6 +83,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeRISCVOptWInstrsPass(*PR);
initializeRISCVPreRAExpandPseudoPass(*PR);
initializeRISCVExpandPseudoPass(*PR);
+ initializeRISCVInsertNTLHInstsPass(*PR);
initializeRISCVInsertVSETVLIPass(*PR);
initializeRISCVDAGToDAGISelPass(*PR);
initializeRISCVInitUndefPass(*PR);
@@ -348,6 +349,7 @@ void RISCVPassConfig::addPreEmitPass() {
void RISCVPassConfig::addPreEmitPass2() {
addPass(createRISCVExpandPseudoPass());
+ addPass(createRISCVInsertNTLHInstsPass());
// Schedule the expansion of AMOs at the last possible moment, avoiding the
// possibility for other passes to break the requirements for forward
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index f18e6a34ecdd..779abf813894 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -64,6 +64,7 @@
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Stack Frame Layout Analysis
; CHECK-NEXT: RISC-V pseudo instruction expansion pass
+; CHECK-NEXT: RISC-V insert NTLH instruction pass
; CHECK-NEXT: RISC-V atomic pseudo instruction expansion pass
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 9179c8c922f1..539e91fce18a 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -177,6 +177,7 @@
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Stack Frame Layout Analysis
; CHECK-NEXT: RISC-V pseudo instruction expansion pass
+; CHECK-NEXT: RISC-V insert NTLH instruction pass
; CHECK-NEXT: RISC-V atomic pseudo instruction expansion pass
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: Machine Optimization Remark Emitter
diff --git a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
new file mode 100644
index 000000000000..6a4ecf38427e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV64V
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV32V
+
+define <vscale x 2 x i64> @test_nontemporal_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0
+ ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0
+ ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0
+ ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0
+ ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+define void @test_nontemporal_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+define void @test_nontemporal_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+define void @test_nontemporal_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
new file mode 100644
index 000000000000..190b896f486c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -0,0 +1,1441 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh < %s | FileCheck %s -check-prefix=CHECK-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh < %s | FileCheck %s -check-prefix=CHECK-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+c < %s | FileCheck %s -check-prefix=CHECK-RV64C
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+c < %s | FileCheck %s -check-prefix=CHECK-RV32C
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV64V
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV32V
+
+define i64 @test_nontemporal_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 0(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 4(a0)
+; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 4(a0)
+; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: ld a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a2, 0(a0)
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a1, 4(a0)
+; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load i64, ptr %p, !nontemporal !0
+ ret i64 %1
+}
+
+define i32 @test_nontemporal_load_i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: lw a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: lw a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: lw a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load i32, ptr %p, !nontemporal !0
+ ret i32 %1
+}
+
+define i16 @test_nontemporal_load_i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: lh a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lh a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: lh a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lh a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: lh a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lh a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load i16, ptr %p, !nontemporal !0
+ ret i16 %1
+}
+
+define i8 @test_nontemporal_load_i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: lbu a0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lbu a0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: lbu a0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lbu a0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: lbu a0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lbu a0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load i8, ptr %p, !nontemporal !0
+ ret i8 %1
+}
+
+define half @test_nontemporal_load_half(ptr %p) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: flh fa5, 0(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: flh fa4, 6(a0)
+; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: flh fa5, 0(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: flh fa4, 6(a0)
+; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: flh fa5, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: flh fa4, 6(a0)
+; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: flh fa5, 0(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: flh fa4, 6(a0)
+; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: flh fa5, 0(a0)
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: flh fa4, 6(a0)
+; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: flh fa5, 0(a0)
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: flh fa4, 6(a0)
+; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load half, ptr %p, !nontemporal !0
+ %2 = getelementptr half, ptr %p, i32 3
+ %3 = load half, ptr %2, !nontemporal !0
+ %4 = fadd half %1, %3
+ ret half %4
+}
+
+define float @test_nontemporal_load_float(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: flw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: flw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: flw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: flw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: flw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: flw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load float, ptr %p, !nontemporal !0
+ ret float %1
+}
+
+define double @test_nontemporal_load_double(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fld fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fld fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fld fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fld fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fld fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fld fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load double, ptr %p, !nontemporal !0
+ ret double %1
+}
+
+define <16 x i8> @test_nontemporal_load_v16i8(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load <16 x i8>, ptr %p, !nontemporal !0
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_load_v8i16(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load <8 x i16>, ptr %p, !nontemporal !0
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_load_v4i32(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 8(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 0(a1)
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 8(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 0(a1)
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load <4 x i32>, ptr %p, !nontemporal !0
+ ret <4 x i32> %1
+}
+
+; Nontemporal load of a <2 x i64> vector.
+; What the CHECK lines below expect:
+;   - RV64 / RV64C: two scalar `ld`s, each preceded by its own ntl.all
+;     (c.ntl.all under the C prefixes).
+;   - RV32 / RV32C: four `lw`s from a1, each preceded by (c.)ntl.all, followed
+;     by four plain `sw`s through a0 that carry no hint (presumably the
+;     indirect return slot rather than the nontemporal access -- TODO confirm).
+;   - RV64V / RV32V: a single `vle64.v`, with one ntl.all placed between the
+;     vsetivli and the vector load.
+define <2 x i64> @test_nontemporal_load_v2i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a2, 0(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a1, 8(a0)
+; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a2, 0(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a1, 8(a0)
+; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vle64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vle64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+
+ %1 = load <2 x i64>, ptr %p, !nontemporal !0
+ ret <2 x i64> %1
+}
+
+; Nontemporal store of an i64 scalar.
+; What the CHECK lines below expect:
+;   - RV64 / RV64C / RV64V: one `sd`, preceded by a single ntl.all
+;     (c.ntl.all under the C prefix).
+;   - RV32 / RV32C / RV32V: the value is stored as two `sw`s (offset 4, then
+;     offset 0), and each `sw` gets its own (c.)ntl.all immediately before it.
+define void @test_nontemporal_store_i64(ptr %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a2, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sd a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sw a2, 4(a0)
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ store i64 %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of an i32 scalar.
+; Every configuration checked below expects a single `sw a1, 0(a0)` with one
+; hint directly in front of it: ntl.all for the RV64/RV32/RV64V/RV32V
+; prefixes and c.ntl.all for the RV64C/RV32C prefixes.
+define void @test_nontemporal_store_i32(ptr %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sw a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sw a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ store i32 %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of an i16 scalar.
+; Every configuration checked below expects a single `sh a1, 0(a0)` with one
+; hint directly in front of it: ntl.all for the RV64/RV32/RV64V/RV32V
+; prefixes and c.ntl.all for the RV64C/RV32C prefixes.
+define void @test_nontemporal_store_i16(ptr %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sh a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sh a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ store i16 %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of an i8 scalar.
+; Every configuration checked below expects a single `sb a1, 0(a0)` with one
+; hint directly in front of it: ntl.all for the RV64/RV32/RV64V/RV32V
+; prefixes and c.ntl.all for the RV64C/RV32C prefixes.
+define void @test_nontemporal_store_i8(ptr %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: sb a1, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: sb a1, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ store i8 %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of a half-precision float.
+; Every configuration checked below expects a single `fsh fa0, 0(a0)` with
+; one hint directly in front of it: ntl.all for the RV64/RV32/RV64V/RV32V
+; prefixes and c.ntl.all for the RV64C/RV32C prefixes.
+define void @test_nontemporal_store_half(ptr %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_half:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_half:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_half:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_half:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_half:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_half:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fsh fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ store half %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of a single-precision float.
+; Every configuration checked below expects a single `fsw fa0, 0(a0)` with
+; one hint directly in front of it: ntl.all for the RV64/RV32/RV64V/RV32V
+; prefixes and c.ntl.all for the RV64C/RV32C prefixes.
+define void @test_nontemporal_store_float(ptr %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_float:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_float:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_float:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_float:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_float:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_float:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fsw fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ store float %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of a double-precision float.
+; Every configuration checked below, including the 32-bit ones, expects a
+; single `fsd fa0, 0(a0)` with one hint directly in front of it: ntl.all for
+; the RV64/RV32/RV64V/RV32V prefixes and c.ntl.all for the RV64C/RV32C
+; prefixes.
+define void @test_nontemporal_store_double(ptr %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_double:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_double:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_double:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_double:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_double:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_double:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: fsd fa0, 0(a0)
+; CHECK-RV32V-NEXT: ret
+
+ store double %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of a <16 x i8> vector.
+; What the CHECK lines below expect:
+;   - RV64 / RV32 / RV64C / RV32C: sixteen plain `lbu`s from a1 (no hint;
+;     presumably the vector argument passed indirectly -- TODO confirm),
+;     then sixteen `sb`s to a0, each preceded by its own ntl.all
+;     (c.ntl.all under the C prefixes). s0 and s1 are spilled in the
+;     prologue and reloaded in the epilogue; those stack accesses carry no
+;     hint.
+;   - RV64V / RV32V: a single `vse8.v`, with one ntl.all placed between the
+;     vsetivli and the vector store.
+define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_v16i8:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: addi sp, sp, -16
+; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64-NEXT: .cfi_offset s0, -8
+; CHECK-RV64-NEXT: .cfi_offset s1, -16
+; CHECK-RV64-NEXT: lbu a2, 0(a1)
+; CHECK-RV64-NEXT: lbu a3, 8(a1)
+; CHECK-RV64-NEXT: lbu a4, 16(a1)
+; CHECK-RV64-NEXT: lbu a5, 24(a1)
+; CHECK-RV64-NEXT: lbu a6, 32(a1)
+; CHECK-RV64-NEXT: lbu a7, 40(a1)
+; CHECK-RV64-NEXT: lbu t0, 48(a1)
+; CHECK-RV64-NEXT: lbu t1, 56(a1)
+; CHECK-RV64-NEXT: lbu t2, 64(a1)
+; CHECK-RV64-NEXT: lbu t3, 72(a1)
+; CHECK-RV64-NEXT: lbu t4, 80(a1)
+; CHECK-RV64-NEXT: lbu t5, 88(a1)
+; CHECK-RV64-NEXT: lbu t6, 120(a1)
+; CHECK-RV64-NEXT: lbu s0, 112(a1)
+; CHECK-RV64-NEXT: lbu s1, 104(a1)
+; CHECK-RV64-NEXT: lbu a1, 96(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t6, 15(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb s0, 14(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb s1, 13(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a1, 12(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t5, 11(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t4, 10(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t3, 9(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t2, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t1, 7(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb t0, 6(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a7, 5(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a6, 4(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a5, 3(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a4, 2(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a3, 1(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sb a2, 0(a0)
+; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64-NEXT: addi sp, sp, 16
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_v16i8:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: addi sp, sp, -16
+; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32-NEXT: .cfi_offset s0, -4
+; CHECK-RV32-NEXT: .cfi_offset s1, -8
+; CHECK-RV32-NEXT: lbu a2, 0(a1)
+; CHECK-RV32-NEXT: lbu a3, 4(a1)
+; CHECK-RV32-NEXT: lbu a4, 8(a1)
+; CHECK-RV32-NEXT: lbu a5, 12(a1)
+; CHECK-RV32-NEXT: lbu a6, 16(a1)
+; CHECK-RV32-NEXT: lbu a7, 20(a1)
+; CHECK-RV32-NEXT: lbu t0, 24(a1)
+; CHECK-RV32-NEXT: lbu t1, 28(a1)
+; CHECK-RV32-NEXT: lbu t2, 32(a1)
+; CHECK-RV32-NEXT: lbu t3, 36(a1)
+; CHECK-RV32-NEXT: lbu t4, 40(a1)
+; CHECK-RV32-NEXT: lbu t5, 44(a1)
+; CHECK-RV32-NEXT: lbu t6, 60(a1)
+; CHECK-RV32-NEXT: lbu s0, 56(a1)
+; CHECK-RV32-NEXT: lbu s1, 52(a1)
+; CHECK-RV32-NEXT: lbu a1, 48(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t6, 15(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb s0, 14(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb s1, 13(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a1, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t5, 11(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t4, 10(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t3, 9(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t2, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t1, 7(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb t0, 6(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a7, 5(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a6, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a5, 3(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a4, 2(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a3, 1(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sb a2, 0(a0)
+; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32-NEXT: addi sp, sp, 16
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_v16i8:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: addi sp, sp, -16
+; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill
+; CHECK-RV64C-NEXT: .cfi_offset s0, -8
+; CHECK-RV64C-NEXT: .cfi_offset s1, -16
+; CHECK-RV64C-NEXT: lbu a6, 0(a1)
+; CHECK-RV64C-NEXT: lbu a7, 8(a1)
+; CHECK-RV64C-NEXT: lbu t0, 16(a1)
+; CHECK-RV64C-NEXT: lbu t1, 24(a1)
+; CHECK-RV64C-NEXT: lbu t2, 32(a1)
+; CHECK-RV64C-NEXT: lbu t3, 40(a1)
+; CHECK-RV64C-NEXT: lbu t4, 48(a1)
+; CHECK-RV64C-NEXT: lbu t5, 56(a1)
+; CHECK-RV64C-NEXT: lbu t6, 64(a1)
+; CHECK-RV64C-NEXT: lbu a3, 72(a1)
+; CHECK-RV64C-NEXT: lbu a4, 80(a1)
+; CHECK-RV64C-NEXT: lbu a5, 88(a1)
+; CHECK-RV64C-NEXT: lbu a2, 120(a1)
+; CHECK-RV64C-NEXT: lbu s0, 112(a1)
+; CHECK-RV64C-NEXT: lbu s1, 104(a1)
+; CHECK-RV64C-NEXT: lbu a1, 96(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a2, 15(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb s0, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb s1, 13(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a1, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a5, 11(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a3, 9(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t6, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t5, 7(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t4, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t3, 5(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t2, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t1, 3(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb t0, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a7, 1(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sb a6, 0(a0)
+; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload
+; CHECK-RV64C-NEXT: addi sp, sp, 16
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_v16i8:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: addi sp, sp, -16
+; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16
+; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
+; CHECK-RV32C-NEXT: .cfi_offset s0, -4
+; CHECK-RV32C-NEXT: .cfi_offset s1, -8
+; CHECK-RV32C-NEXT: lbu a6, 0(a1)
+; CHECK-RV32C-NEXT: lbu a7, 4(a1)
+; CHECK-RV32C-NEXT: lbu t0, 8(a1)
+; CHECK-RV32C-NEXT: lbu t1, 12(a1)
+; CHECK-RV32C-NEXT: lbu t2, 16(a1)
+; CHECK-RV32C-NEXT: lbu t3, 20(a1)
+; CHECK-RV32C-NEXT: lbu t4, 24(a1)
+; CHECK-RV32C-NEXT: lbu t5, 28(a1)
+; CHECK-RV32C-NEXT: lbu t6, 32(a1)
+; CHECK-RV32C-NEXT: lbu a3, 36(a1)
+; CHECK-RV32C-NEXT: lbu a4, 40(a1)
+; CHECK-RV32C-NEXT: lbu a5, 44(a1)
+; CHECK-RV32C-NEXT: lbu a2, 60(a1)
+; CHECK-RV32C-NEXT: lbu s0, 56(a1)
+; CHECK-RV32C-NEXT: lbu s1, 52(a1)
+; CHECK-RV32C-NEXT: lbu a1, 48(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a2, 15(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb s0, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb s1, 13(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a1, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a5, 11(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a3, 9(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t6, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t5, 7(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t4, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t3, 5(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t2, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t1, 3(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb t0, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a7, 1(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sb a6, 0(a0)
+; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
+; CHECK-RV32C-NEXT: addi sp, sp, 16
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_v16i8:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse8.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_v16i8:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse8.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <16 x i8> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of an <8 x i16> vector.
+; What the CHECK lines below expect:
+;   - RV64 / RV32 / RV64C / RV32C: eight plain `lh`s from a1 (no hint;
+;     presumably the vector argument passed indirectly -- TODO confirm),
+;     then eight `sh`s to a0, each preceded by its own ntl.all
+;     (c.ntl.all under the C prefixes).
+;   - RV64V / RV32V: a single `vse16.v`, with one ntl.all placed between the
+;     vsetivli and the vector store.
+define void @test_nontemporal_store_v8i16(ptr %p, <8 x i16> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_v8i16:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lh a2, 0(a1)
+; CHECK-RV64-NEXT: lh a3, 8(a1)
+; CHECK-RV64-NEXT: lh a4, 16(a1)
+; CHECK-RV64-NEXT: lh a5, 24(a1)
+; CHECK-RV64-NEXT: lh a6, 56(a1)
+; CHECK-RV64-NEXT: lh a7, 48(a1)
+; CHECK-RV64-NEXT: lh t0, 40(a1)
+; CHECK-RV64-NEXT: lh a1, 32(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a6, 14(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a7, 12(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh t0, 10(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a1, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a5, 6(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a3, 2(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sh a2, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_v8i16:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lh a2, 0(a1)
+; CHECK-RV32-NEXT: lh a3, 4(a1)
+; CHECK-RV32-NEXT: lh a4, 8(a1)
+; CHECK-RV32-NEXT: lh a5, 12(a1)
+; CHECK-RV32-NEXT: lh a6, 28(a1)
+; CHECK-RV32-NEXT: lh a7, 24(a1)
+; CHECK-RV32-NEXT: lh t0, 20(a1)
+; CHECK-RV32-NEXT: lh a1, 16(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a6, 14(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a7, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh t0, 10(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a1, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a5, 6(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a3, 2(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sh a2, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_v8i16:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lh a6, 0(a1)
+; CHECK-RV64C-NEXT: lh a7, 8(a1)
+; CHECK-RV64C-NEXT: lh t0, 16(a1)
+; CHECK-RV64C-NEXT: lh a5, 24(a1)
+; CHECK-RV64C-NEXT: lh a2, 56(a1)
+; CHECK-RV64C-NEXT: lh a3, 48(a1)
+; CHECK-RV64C-NEXT: lh a4, 40(a1)
+; CHECK-RV64C-NEXT: lh a1, 32(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a2, 14(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a3, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a4, 10(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a1, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a5, 6(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh t0, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a7, 2(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sh a6, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_v8i16:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lh a6, 0(a1)
+; CHECK-RV32C-NEXT: lh a7, 4(a1)
+; CHECK-RV32C-NEXT: lh t0, 8(a1)
+; CHECK-RV32C-NEXT: lh a5, 12(a1)
+; CHECK-RV32C-NEXT: lh a2, 28(a1)
+; CHECK-RV32C-NEXT: lh a3, 24(a1)
+; CHECK-RV32C-NEXT: lh a4, 20(a1)
+; CHECK-RV32C-NEXT: lh a1, 16(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a2, 14(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a3, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a4, 10(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a1, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a5, 6(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh t0, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a7, 2(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sh a6, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_v8i16:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse16.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_v8i16:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse16.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <8 x i16> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of a <4 x i32> vector.
+; What the CHECK lines below expect:
+;   - RV64 / RV32 / RV64C / RV32C: four plain `lw`s from a1 (no hint;
+;     presumably the vector argument passed indirectly -- TODO confirm),
+;     then four `sw`s to a0, each preceded by its own ntl.all
+;     (c.ntl.all under the C prefixes).
+;   - RV64V / RV32V: a single `vse32.v`, with one ntl.all placed between the
+;     vsetivli and the vector store.
+define void @test_nontemporal_store_v4i32(ptr %p, <4 x i32> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_v4i32:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: lw a2, 24(a1)
+; CHECK-RV64-NEXT: lw a3, 16(a1)
+; CHECK-RV64-NEXT: lw a4, 8(a1)
+; CHECK-RV64-NEXT: lw a1, 0(a1)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a2, 12(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a3, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a4, 4(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sw a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_v4i32:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_v4i32:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: lw a2, 24(a1)
+; CHECK-RV64C-NEXT: lw a3, 16(a1)
+; CHECK-RV64C-NEXT: lw a4, 8(a1)
+; CHECK-RV64C-NEXT: lw a1, 0(a1)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a2, 12(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a3, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a4, 4(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sw a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_v4i32:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_v4i32:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse32.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_v4i32:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse32.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <4 x i32> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Nontemporal store of a <2 x i64> vector.
+; What the CHECK lines below expect:
+;   - RV64 / RV64C: two `sd`s (a2 to offset 8, a1 to offset 0) with no
+;     preceding loads, each preceded by its own ntl.all (c.ntl.all under the
+;     C prefix).
+;   - RV32 / RV32C: four plain `lw`s from a1 (no hint; presumably the vector
+;     argument passed indirectly -- TODO confirm), then four `sw`s to a0,
+;     each preceded by its own (c.)ntl.all.
+;   - RV64V / RV32V: a single `vse64.v`, with one ntl.all placed between the
+;     vsetivli and the vector store.
+define void @test_nontemporal_store_v2i64(ptr %p, <2 x i64> %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_v2i64:
+; CHECK-RV64: # %bb.0:
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sd a2, 8(a0)
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: sd a1, 0(a0)
+; CHECK-RV64-NEXT: ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_v2i64:
+; CHECK-RV32: # %bb.0:
+; CHECK-RV32-NEXT: lw a2, 12(a1)
+; CHECK-RV32-NEXT: lw a3, 8(a1)
+; CHECK-RV32-NEXT: lw a4, 4(a1)
+; CHECK-RV32-NEXT: lw a1, 0(a1)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a2, 12(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a3, 8(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a4, 4(a0)
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: sw a1, 0(a0)
+; CHECK-RV32-NEXT: ret
+;
+; CHECK-RV64C-LABEL: test_nontemporal_store_v2i64:
+; CHECK-RV64C: # %bb.0:
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sd a2, 8(a0)
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: sd a1, 0(a0)
+; CHECK-RV64C-NEXT: ret
+;
+; CHECK-RV32C-LABEL: test_nontemporal_store_v2i64:
+; CHECK-RV32C: # %bb.0:
+; CHECK-RV32C-NEXT: lw a2, 12(a1)
+; CHECK-RV32C-NEXT: lw a3, 8(a1)
+; CHECK-RV32C-NEXT: lw a4, 4(a1)
+; CHECK-RV32C-NEXT: lw a1, 0(a1)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a2, 12(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a3, 8(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a4, 4(a0)
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: sw a1, 0(a0)
+; CHECK-RV32C-NEXT: ret
+;
+; CHECK-RV64V-LABEL: test_nontemporal_store_v2i64:
+; CHECK-RV64V: # %bb.0:
+; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64V-NEXT: ntl.all
+; CHECK-RV64V-NEXT: vse64.v v8, (a0)
+; CHECK-RV64V-NEXT: ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_v2i64:
+; CHECK-RV32V: # %bb.0:
+; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: vse64.v v8, (a0)
+; CHECK-RV32V-NEXT: ret
+ store <2 x i64> %v, ptr %p, !nontemporal !0
+ ret void
+}
+
+; Metadata node referenced by the `!nontemporal !0` attachments on the loads
+; and stores above; it holds the single operand `i32 1`.
+!0 = !{i32 1}
More information about the llvm-commits
mailing list