From fcce2a2794417a6ff16148dbb751e402e476084a Mon Sep 17 00:00:00 2001
From: chengfeng27 <chengfeng27@huawei.com>
Date: Tue, 23 Jul 2024 10:46:59 +0800
Subject: [PATCH] fix memory leak

Summary of changes:
- core_affinity: skip CPU frequency probing on OHOS builds.
- mmap_utils: make MAP_POPULATE optional in ReadFileByMmap and log munmap
  failures instead of ignoring them.
- litert: add CacheSession, which builds an NNRT compilation directly from an
  existing compilation cache (SUPPORT_NNRT_METAGRAPH), converts only the graph
  input/output tensors, and destroys its NNExecutor in the destructor.
- model_impl: create a CacheSession instead of a LiteSession when a Kirin NPU
  device with an existing compiled cache is configured.
- extension_options_parser: parse the "ModelName" extension option.
- lite_model/lite_session: expose PrepareInnerTensors and make the load and
  compile entry points virtual so CacheSession can override them.
---
 .../core/mindrt/src/thread/core_affinity.cc   |   2 +-
 mindspore/lite/BUILD.gn                       |   5 +-
 mindspore/lite/src/common/mmap_utils.cc       |  14 +-
 mindspore/lite/src/common/mmap_utils.h        |   2 +-
 mindspore/lite/src/litert/cache_session.cc    | 425 ++++++++++++++++++
 mindspore/lite/src/litert/cache_session.h     | 130 ++++++
 .../src/litert/cxx_api/model/model_impl.cc    |  36 +-
 .../delegate/nnrt/extension_options_parser.cc |  12 +
 .../delegate/nnrt/extension_options_parser.h  |   2 +
 mindspore/lite/src/litert/lite_model.cc       |  12 +-
 mindspore/lite/src/litert/lite_model.h        |   2 +-
 mindspore/lite/src/litert/lite_session.h      |   6 +-
 12 files changed, 632 insertions(+), 16 deletions(-)
 create mode 100644 mindspore/lite/src/litert/cache_session.cc
 create mode 100644 mindspore/lite/src/litert/cache_session.h

diff --git a/mindspore/core/mindrt/src/thread/core_affinity.cc b/mindspore/core/mindrt/src/thread/core_affinity.cc
index 6886f743..6d13724f 100644
--- a/mindspore/core/mindrt/src/thread/core_affinity.cc
+++ b/mindspore/core/mindrt/src/thread/core_affinity.cc
@@ -217,7 +217,7 @@ int GetMaxFrequency(int core_id) {
 
 float CoreAffinity::GetServerFrequency() {
   float max_freq = -1.0f;
-#if defined(__APPLE__) || defined(__MACOSX) || defined(_MSC_VER) || defined(_WIN32)
+#if defined(__APPLE__) || defined(__MACOSX) || defined(_MSC_VER) || defined(_WIN32) || defined(MS_COMPILE_OHOS)
   return max_freq;  // MHz
 #else
   // The CPU cores in the server of the numa architecture are the same.
diff --git a/mindspore/lite/BUILD.gn b/mindspore/lite/BUILD.gn
index acee9733..d8ed3b44 100644
--- a/mindspore/lite/BUILD.gn
+++ b/mindspore/lite/BUILD.gn
@@ -438,7 +438,10 @@ ohos_shared_library("mindspore_lib") {
   if (SUPPORT_NNRT) {
     if (mindspore_feature_nnrt_metagraph) {
       defines += [ "SUPPORT_NNRT_METAGRAPH" ]
-      sources += [ "src/litert/delegate/nnrt/hiai_foundation_wrapper.cc", ]
+      sources += [
+        "src/litert/delegate/nnrt/hiai_foundation_wrapper.cc",
+        "src/litert/cache_session.cc",
+      ]
       print("enabled feature: mindspore_feature_nnrt_metagraph")
     }
     sources += [
diff --git a/mindspore/lite/src/common/mmap_utils.cc b/mindspore/lite/src/common/mmap_utils.cc
index ca8f8d1e..0dd31f7c 100644
--- a/mindspore/lite/src/common/mmap_utils.cc
+++ b/mindspore/lite/src/common/mmap_utils.cc
@@ -24,7 +24,7 @@
 
 namespace mindspore {
 namespace lite {
-void *ReadFileByMmap(const std::string &file, size_t *size) {
+void *ReadFileByMmap(const std::string &file, size_t *size, bool populate) {
 #if !defined(_WIN32) && !defined(_WIN64) && !defined(MS_COMPILE_IOS)
   auto real_path = RealPath(file.c_str());
   auto fd = open(real_path.c_str(), O_RDONLY);
@@ -39,7 +39,12 @@ void *ReadFileByMmap(const std::string &file, size_t *size) {
     return nullptr;
   }
   *size = fd_stat.st_size;
-  auto mmap_buffers = mmap(nullptr, *size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
+  void *mmap_buffers;
+  if (populate) {
+    mmap_buffers = mmap(nullptr, *size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
+  } else {
+    mmap_buffers = mmap(nullptr, *size, PROT_READ, MAP_SHARED, fd, 0);
+  }
   close(fd);
   if (mmap_buffers == MAP_FAILED) {
     MS_LOG(ERROR) << "Model mmap failed.";
@@ -54,7 +59,10 @@ void *ReadFileByMmap(const std::string &file, size_t *size) {
 
 void UnmapMmapBuffer(void *buffer, size_t size) {
 #if !defined(_WIN32) && !defined(_WIN64)
-  (void)munmap(buffer, size);
+  auto ret = munmap(buffer, size);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "munmap failed ret: " << ret << ", err: " << strerror(errno);
+  }
 #else
   MS_LOG(ERROR) << "Mmap is unsupported on windows.";
 #endif
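
Note: the hunk above makes the MAP_POPULATE pre-fault optional and reports munmap failures instead of discarding them. A self-contained sketch of the same POSIX pattern, independent of the MindSpore sources (names here are illustrative only):

// map_file_sketch.cc - mirrors the populate/no-populate split used above.
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cerrno>
#include <cstdio>
#include <cstring>

void *MapFileReadOnly(const char *path, size_t *size, bool populate) {
  int fd = open(path, O_RDONLY);
  if (fd < 0) {
    return nullptr;
  }
  struct stat st;
  if (fstat(fd, &st) != 0) {
    close(fd);
    return nullptr;
  }
  *size = static_cast<size_t>(st.st_size);
  // MAP_POPULATE pre-faults all pages at map time; without it pages fault in lazily.
  int flags = MAP_SHARED | (populate ? MAP_POPULATE : 0);
  void *buf = mmap(nullptr, *size, PROT_READ, flags, fd, 0);
  close(fd);  // the mapping remains valid after the descriptor is closed
  return (buf == MAP_FAILED) ? nullptr : buf;
}

void UnmapFile(void *buf, size_t size) {
  if (munmap(buf, size) != 0) {  // munmap returns 0 on success, -1 on failure
    fprintf(stderr, "munmap failed: %s\n", strerror(errno));
  }
}
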
diff --git a/mindspore/lite/src/common/mmap_utils.h b/mindspore/lite/src/common/mmap_utils.h
index bdd7c9a5..d3b0ec5f 100644
--- a/mindspore/lite/src/common/mmap_utils.h
+++ b/mindspore/lite/src/common/mmap_utils.h
@@ -20,7 +20,7 @@
 
 namespace mindspore {
 namespace lite {
-void *ReadFileByMmap(const std::string &file, size_t *size);
+void *ReadFileByMmap(const std::string &file, size_t *size, bool populate = true);
 void UnmapMmapBuffer(void *buffer, size_t size);
 }  // namespace lite
 }  // namespace mindspore
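
Note: because the new parameter is defaulted to true, existing callers keep the MAP_POPULATE behaviour unchanged; only CacheSession passes false. A hypothetical caller (function name and error handling are illustrative, not part of the patch):

#include <string>
#include "src/common/mmap_utils.h"

int LoadModelBuffer(const std::string &path) {
  size_t size = 0;
  // populate defaults to true; pass false to let pages fault in on first access.
  void *buf = mindspore::lite::ReadFileByMmap(path, &size, /*populate=*/false);
  if (buf == nullptr) {
    return -1;
  }
  // ... parse the model from buf[0, size) ...
  mindspore::lite::UnmapMmapBuffer(buf, size);
  return 0;
}
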
diff --git a/mindspore/lite/src/litert/cache_session.cc b/mindspore/lite/src/litert/cache_session.cc
new file mode 100644
index 00000000..7bafe3f7
--- /dev/null
+++ b/mindspore/lite/src/litert/cache_session.cc
@@ -0,0 +1,425 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cache_session.h"
+#include "src/common/context_util.h"
+#include "src/common/tensor_util.h"
+#include "src/common/mmap_utils.h"
+#include "src/common/file_utils.h"
+#include "src/litert/delegate/nnrt/nnrt_model_kernel.h"
+
+namespace mindspore {
+namespace lite {
+CacheSession::~CacheSession() {
+  if (nn_executor_ != nullptr) {
+    OH_NNExecutor_Destroy(&nn_executor_);
+    MS_LOG(INFO) << "Destroy NNExecutor Finish.";
+  }
+}
+
+int CacheSession::CompileGraph(Model *model) {
+  bool expected = false;
+  if (!is_running_.compare_exchange_strong(expected, true)) {
+    MS_LOG(ERROR) << "Not support multi-threading";
+    return RET_ERROR;
+  }
+  // Convert to abstract base model interface
+  auto ret = ConvertInOutTensors(model);
+  context_->set_schema_version(reinterpret_cast<LiteModel *>(model)->GetSchemaVersion());
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "ConvertTensors failed: " << ret;
+    is_running_.store(false);
+    return ret;
+  }
+  InitGraphInputTensors(model);
+  InitGraphOutputTensors(model);
+
+  // create NNRt kernel
+  ret = ScheduleToNNRTKernel();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Schedule NNRt kernel failed: " << ret;
+    is_running_.store(false);
+    return ret;
+  }
+
+  InitGraphInOutTensorsMap(model);
+  ret = PrepareKernels(model);
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare kernels failed: " << ret;
+    is_running_.store(false);
+    return ret;
+  }
+
+  ret = InitExecutor();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "InitExecutor failed: " << ret;
+    is_running_.store(false);
+    return ret;
+  }
+
+  MarkSharedWeight(kernels_);
+  FreePackOpWeight(kernels_);
+
+  is_running_.store(false);
+  return RET_OK;
+}
+
+int CacheSession::InitExecutor() {
+  executor_ = new (std::nothrow) Executor();
+  if (executor_ == nullptr) {
+    MS_LOG(ERROR) << "New Executor failed";
+    return RET_ERROR;
+  }
+  auto ret = executor_->Prepare(kernels_, inputs_, outputs_, context_.get());
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Prepare executor failed: " << ret;
+    return ret;
+  }
+  return RET_OK;
+}
+
+int CacheSession::ConvertInOutTensors(const lite::Model *model) {
+  MS_ASSERT(model != nullptr);
+  auto lite_model = reinterpret_cast<const lite::LiteModel *>(model);
+  uint32_t tensor_count = model->graph_.all_tensors_.size();
+  auto model_input_indices = model->graph_.input_indices_;
+  auto model_output_indices = model->graph_.output_indices_;
+
+  for (uint32_t i = 0; i < tensor_count; ++i) {
+    auto *src_tensor = model->graph_.all_tensors_[i];
+    if (!IsContain(model_input_indices, i) && !IsContain(model_output_indices, i)) {
+      this->tensors_.emplace_back(nullptr);
+      continue;
+    }
+    if (src_tensor == nullptr) {
+      MS_LOG(ERROR) << i << "th tensor in model is nullptr";
+      return RET_NULL_PTR;
+    }
+    auto *dst_tensor = ConvertTensor(*src_tensor);
+    if (dst_tensor == nullptr) {
+      MS_LOG(ERROR) << "Convert new " << i << "th tensor failed!";
+      return RET_NULL_PTR;
+    }
+    auto ret = ConvertTensorsData(lite_model, i, dst_tensor);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Convert data of " << i << "th tensor failed";
+      delete dst_tensor;
+      return ret;
+    }
+    ConvertTensorsQuantParam(src_tensor, dst_tensor);
+    if (IsContain(model_input_indices, i)) {
+      dst_tensor->set_category(Category::GRAPH_INPUT);
+    }
+    if (IsContain(model_output_indices, i)) {
+      // a tensor that is both an input and an output is treated as an input.
+      if (!dst_tensor->IsGraphInput()) {
+        dst_tensor->set_category(Category::GRAPH_OUTPUT);
+      }
+    }
+
+    ret = CheckTensorValid(dst_tensor);
+    if (ret != RET_OK) {
+      MS_LOG(ERROR) << "Check " << i << "th tensor failed";
+      delete dst_tensor;
+      return ret;
+    }
+
+    this->tensors_.emplace_back(dst_tensor);
+  }
+  return RET_OK;
+}
+
+int CacheSession::Init(const std::shared_ptr<InnerContext> &context) {
+  if (context == nullptr) {
+    MS_LOG(ERROR) << "context is nullptr";
+    return RET_NULL_PTR;
+  }
+  bool expected = false;
+  if (!is_running_.compare_exchange_strong(expected, true)) {
+    MS_LOG(ERROR) << "Not support multi-threading";
+    return RET_ERROR;
+  }
+  context_ = context;
+  auto ret = context_->Init();
+  if (ret != RET_OK) {
+    MS_LOG(ERROR) << "Init Context failed";
+    return ret;
+  }
+  ms_context_ = MSContextFromContext(context);
+  if (ms_context_ == nullptr) {
+    MS_LOG(ERROR) << "transfer context to ms context failed.";
+    return RET_NULL_PTR;
+  }
+
+  auto iter = std::find_if(context_->device_list_.begin(), context_->device_list_.end(),
+                           [](DeviceContext &device) { return device.device_type_ == lite::DT_NNRT; });
+  if (iter == context_->device_list_.end()) {
+    MS_LOG(ERROR) << "Cannot find NNRT device info in context";
+    return RET_ERROR;
+  }
+  nnrt_device_info_ = iter->device_info_.nnrt_device_info_;
+
+  const auto &extensions = nnrt_device_info_.extensions_;
+  mindspore::lite::nnrt::ExtensionOptionsParser::Parse(extensions, &extension_options_);
+
+  is_running_.store(false);
+  return RET_OK;
+}
+
+int CacheSession::ParseInputOutputFromModelBuffer(const char *model_buf, LiteModel *model) {
+  const void *meta_graph = nullptr;
+  meta_graph = reinterpret_cast<const void *>(schema::GetMetaGraph(model_buf));
+  assert(meta_graph != nullptr);
+
+  auto status = GenerateModelInputOutput<schema::MetaGraph, schema::CNode>(
+    *reinterpret_cast<const schema::MetaGraph *>(meta_graph), model->graph_);
+  if (status != RET_OK) {
+    MS_LOG(ERROR) << "Failed to generate model input/output";
+    return status;
+  }
+  model->buf = const_cast<char *>(model_buf);
+  return RET_OK;
+}
+
+int CacheSession::LoadModelAndCompileByPath(const std::string &model_path, mindspore::ModelType model_type) {
+  size_t model_size;
+  bool use_mmap = IsMmapEnable();
+  auto model_buf = LoadModelByPath(model_path, model_type, &model_size, use_mmap);
+  if (model_buf == nullptr) {
+    MS_LOG(ERROR) << "Read model file failed";
+    return RET_ERROR;
+  }
+
+  if (extension_options_.cache_path_.empty()) {
+    MS_LOG(ERROR) << "cache path is empty";
+    return RET_ERROR;
+  }
+  Model *model = ImportInOutFromBuffer(model_buf, model_size, true, model_type, model_path);
+  if (model == nullptr) {
+    MS_LOG(ERROR) << "Import model failed";
+    return RET_ERROR;
+  }
+  if (!dynamic_cast<LiteModel *>(model)->PrepareInnerTensors()) {
+    MS_LOG(WARNING) << "PrepareInnerTensors failed";
+  }
+
+  if (use_mmap) {
+    reinterpret_cast<lite::LiteModel *>(model)->model_buf_by_mmap_ = true;
+  } else {
+    MS_LOG(WARNING) << "Memory may exceed the limit of business demands.";
+  }
+  (reinterpret_cast<lite::LiteModel *>(model))->set_keep_model_buf(true);
+  auto ret = CompileGraph(model);
+  if (ret != lite::RET_OK) {
+    MS_LOG(ERROR) << "Compile model failed";
+    model->buf = nullptr;
+    delete model;
+    return RET_ERROR;
+  }
+  set_model(model);
+  return RET_OK;
+}
+
+Model *CacheSession::ImportInOutFromBuffer(const char *model_buf, size_t size, bool take_buf, mindspore::ModelType model_type,
+                               const std::string &path) {
+  MS_LOG(INFO) << "import model from lite model";
+  auto *model = new (std::nothrow) LiteModel(path);
+  if (model == nullptr) {
+    MS_LOG(ERROR) << "new model fail!";
+    return nullptr;
+  }
+
+  auto status = ParseInputOutputFromModelBuffer(model_buf, model);
+  if (status != RET_OK) {
+    MS_LOG(ERROR) << "construct model failed.";
+    delete model;
+    return nullptr;
+  }
+  model->buf = const_cast<char *>(model_buf);
+  model->buf_size_ = size;
+  return model;
+}
+
+int CacheSession::ScheduleToNNRTKernel() {
+  if (!IsKirinNPUWithOnlineInference(nnrt_device_info_.device_id_)) {
+    MS_LOG(ERROR) << "Only Kirin NPU devices (name prefix NPU_) are supported.";
+    return RET_ERROR;
+  }
+  auto ret = CreateFullModelKernel();
+  if (ret != kSuccess) {
+    MS_LOG(ERROR) << "Build npu model failed.";
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
+bool CacheSession::IsKirinNPUWithOnlineInference(size_t device_id) {
+  const std::string kirin_npu_name_prefix = "NPU_";
+  const char *device_name;
+  auto ret = OH_NNDevice_GetName(device_id, &device_name);
+  if (ret != OH_NN_SUCCESS) {
+    MS_LOG(WARNING) << "Get name of device: " << device_id << " failed, error: " << ret;
+    return false;
+  }
+
+  if (strncmp(kirin_npu_name_prefix.c_str(), device_name, kirin_npu_name_prefix.size()) != 0) {
+    MS_LOG(WARNING) << "Device " << device_id << " name " << device_name << " is not a Kirin NPU";
+    return false;
+  }
+  return true;
+}
+
+Status CacheSession::CreateFullModelKernel() {
+  OH_NNCompilation* nn_compilation = OH_NNCompilation_ConstructForCache();
+  if (nn_compilation == nullptr) {
+    MS_LOG(ERROR) << "Construct NNCompilation failed";
+    return kLiteError;
+  }
+  MS_LOG(DEBUG) << "NNRTDelegate creates NNCompilation success.";
+
+  auto ret_code = InitNNCompilation(nn_compilation);
+  if (ret_code != kSuccess) {
+    MS_LOG(ERROR) << "Init NNCompilation failed";
+    OH_NNCompilation_Destroy(&nn_compilation);
+    return kLiteError;
+  }
+
+  OH_NNExecutor *nn_executor = nullptr;
+  nn_executor = OH_NNExecutor_Construct(nn_compilation);
+  if (nn_executor == nullptr) {
+    MS_LOG(ERROR) << "Construct NNExecutor failed";
+    OH_NNCompilation_Destroy(&nn_compilation);
+    return kLiteError;
+  }
+  OH_NNCompilation_Destroy(&nn_compilation);
+
+  ms_inputs_ = LiteTensorsToMSTensors(inputs_);
+  ms_outputs_ = LiteTensorsToMSTensors(outputs_);
+  auto nnrt_model_kernel = new (std::nothrow) NNRTModelKernel(nn_executor, nnrt_device_info_, ms_inputs_, ms_outputs_);
+  if (nnrt_model_kernel == nullptr) {
+    OH_NNExecutor_Destroy(&nn_executor);
+    MS_LOG(ERROR) << "new NNRTModelKernel failed";
+    return kLiteError;
+  }
+  nn_executor_ = nn_executor;
+
+  std::shared_ptr<kernel::Kernel> shared_kernel(nnrt_model_kernel);
+  auto *kernel_exec = new (std::nothrow) kernel::KernelExec(shared_kernel);
+  if (kernel_exec == nullptr) {
+    MS_LOG(ERROR) << "nnrt kernel exec create failed.";
+    return kLiteError;
+  }
+  auto delegate_type = kNumberTypeFloat32;
+  for (auto &input : nnrt_model_kernel->inputs()) {
+    if (static_cast<TypeId>(input.DataType()) == kNumberTypeFloat16) {
+      delegate_type = kNumberTypeFloat16;
+      break;
+    }
+  }
+  kernel::KernelKey delegate_desc{kernel::kDelegate, delegate_type, NHWC, schema::PrimitiveType_NONE, "", ""};
+  kernel_exec->set_desc(delegate_desc);
+  kernel_exec->set_context(context_.get());
+  kernels_.push_back(kernel_exec);
+
+  return kSuccess;
+}
+
+Status CacheSession::InitNNCompilation(OH_NNCompilation *nn_compilation) const {
+  auto ret_code = OH_NNCompilation_SetDevice(nn_compilation, nnrt_device_info_.device_id_);
+  if (ret_code != OH_NN_SUCCESS) {
+    MS_LOG(ERROR) << "NNCompilation set device id failed, ret: " << ret_code;
+    return kLiteError;
+  }
+  ret_code = OH_NNCompilation_SetPerformanceMode(nn_compilation,
+                                                 (OH_NN_PerformanceMode)(nnrt_device_info_.performance_mode_));
+  if ((ret_code != OH_NN_SUCCESS) && (ret_code != OH_NN_OPERATION_FORBIDDEN)) {
+    MS_LOG(ERROR) << "NNCompilation set performance mode failed, ret: " << ret_code;
+    return kLiteError;
+  }
+  ret_code = OH_NNCompilation_SetPriority(nn_compilation, (OH_NN_Priority)(nnrt_device_info_.priority_));
+  if ((ret_code != OH_NN_SUCCESS) && (ret_code != OH_NN_OPERATION_FORBIDDEN)) {
+    MS_LOG(ERROR) << "NNCompilation set priority failed, ret: " << ret_code;
+    return kLiteError;
+  }
+  ret_code = OH_NNCompilation_EnableFloat16(nn_compilation, nnrt_device_info_.enable_fp16_);
+  if ((ret_code != OH_NN_SUCCESS) && (ret_code != OH_NN_OPERATION_FORBIDDEN)) {
+    MS_LOG(ERROR) << "NNCompilation enable fp16 failed, ret: " << ret_code;
+    return kLiteError;
+  }
+
+  if (!extension_options_.cache_path_.empty()) {
+    ret_code = OH_NNCompilation_SetCache(nn_compilation, extension_options_.cache_path_.c_str(),
+                                         extension_options_.cache_version_);
+    if ((ret_code != OH_NN_SUCCESS) && (ret_code != OH_NN_OPERATION_FORBIDDEN)) {
+      MS_LOG(ERROR) << "NNCompilation set cache failed, ret: " << ret_code;
+      return kLiteError;
+    }
+  } else {
+    MS_LOG(ERROR) << "NNCompilation must set Cache.";
+    return kLiteError;
+  }
+
+  size_t extension_size = nnrt_device_info_.extensions_.size();
+  for (size_t i = 0; i < extension_size; i++) {
+    auto &src_extension = nnrt_device_info_.extensions_[i];
+    ret_code = OH_NNCompilation_AddExtensionConfig(nn_compilation, src_extension.name.c_str(),
+                                                   (char *)((void *)src_extension.value.data()),
+                                                   src_extension.value.size());
+    if (ret_code != OH_NN_SUCCESS) {
+      MS_LOG(ERROR) << "OH_NNCompilation_AddExtensionConfig " << i << ": " << src_extension.name << " failed, ret: "
+                    << ret_code;
+      return kLiteError;
+    }
+  }
+
+  ret_code = OH_NNCompilation_Build(nn_compilation);
+  if (ret_code != OH_NN_SUCCESS) {
+    MS_LOG(ERROR) << "Build NNCompilation failed, ret: " << ret_code;
+    return kLiteError;
+  }
+  return kSuccess;
+}
+
+const char *CacheSession::LoadModelByPath(const std::string &file, mindspore::ModelType model_type, size_t *size, bool use_mmap) {
+  size_t buf_size;
+  char *model_buf;
+  if (use_mmap) {
+    model_buf = reinterpret_cast<char *>(lite::ReadFileByMmap(file.c_str(), &buf_size, false));
+  } else {
+    MS_LOG(WARNING) << "Memory may exceed the limit of business demands.";
+    model_buf = lite::ReadFile(file.c_str(), &buf_size);
+  }
+  if (model_buf == nullptr) {
+    MS_LOG(ERROR) << "The model path is invalid";
+    return model_buf;
+  }
+
+  char *lite_buf = nullptr;
+  auto buf_model_type = LoadModelByBuff(model_buf, buf_size, &lite_buf, size, model_type);
+  if (buf_model_type == mindspore::ModelType::kUnknownType || lite_buf == nullptr) {
+    if (use_mmap) {
+      lite::UnmapMmapBuffer(const_cast<void *>(static_cast<const void *>(model_buf)), buf_size);
+    } else {
+      delete[] model_buf;
+    }
+    model_buf = nullptr;
+    return nullptr;
+  }
+
+  return lite_buf;
+}
+}  // namespace lite
+}  // namespace mindspore
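
Note: CacheSession stores the raw OH_NNExecutor handle in nn_executor_ and releases it in the destructor above. The same ownership guarantee could be written as a small RAII guard; the sketch below only illustrates that idea and is not code from the patch:

#include "interfaces/kits/c/neural_network_runtime/neural_network_runtime.h"

// Owns an OH_NNExecutor and destroys it exactly once, even on early returns.
class NNExecutorGuard {
 public:
  explicit NNExecutorGuard(OH_NNExecutor *executor) : executor_(executor) {}
  ~NNExecutorGuard() {
    if (executor_ != nullptr) {
      OH_NNExecutor_Destroy(&executor_);  // also resets executor_ to nullptr
    }
  }
  NNExecutorGuard(const NNExecutorGuard &) = delete;
  NNExecutorGuard &operator=(const NNExecutorGuard &) = delete;
  OH_NNExecutor *get() const { return executor_; }
  OH_NNExecutor *release() {  // hand ownership to a longer-lived holder
    OH_NNExecutor *tmp = executor_;
    executor_ = nullptr;
    return tmp;
  }

 private:
  OH_NNExecutor *executor_{nullptr};
};
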
diff --git a/mindspore/lite/src/litert/cache_session.h b/mindspore/lite/src/litert/cache_session.h
new file mode 100644
index 00000000..f0ae185a
--- /dev/null
+++ b/mindspore/lite/src/litert/cache_session.h
@@ -0,0 +1,130 @@
+/**
+ * Copyright 2024 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_LITE_SRC_LITERT_CACHE_SESSION_H_
+#define MINDSPORE_LITE_SRC_LITERT_CACHE_SESSION_H_
+
+#include "src/litert/lite_session.h"
+#include "src/litert/inner_context.h"
+#include "src/litert/lite_model.h"
+#include "src/litert/delegate/nnrt/extension_options_parser.h"
+#include "interfaces/kits/c/neural_network_runtime/neural_network_runtime_type.h"
+#include "interfaces/kits/c/neural_network_runtime/neural_network_runtime.h"
+#include "interfaces/innerkits/c/neural_network_runtime_inner.h"
+
+namespace mindspore {
+namespace lite {
+class CacheSession : public LiteSession {
+ public:
+  CacheSession() = default;
+  ~CacheSession() override;
+  int Init(const std::shared_ptr<InnerContext> &context) override;
+  int CompileGraph(Model *model) override;
+  int LoadModelAndCompileByPath(const std::string &model_path, mindspore::ModelType model_type) override;
+  static bool IsKirinNPUWithOnlineInference(size_t device_id);
+  const char *LoadModelByPath(const std::string &file, mindspore::ModelType model_type, size_t *size,
+                              bool use_mmap) override;
+  Model* ImportInOutFromBuffer(const char *model_buf, size_t size, bool take_buf,
+                               mindspore::ModelType model_type = mindspore::ModelType::kMindIR_Lite,
+                               const std::string &path = "");
+
+  template <typename T = schema::MetaGraph>
+  bool ConvertInputOutputTensors(const T &meta_graph, LiteGraph &graph_) {
+    if (meta_graph.allTensors() == nullptr) {
+      MS_LOG(ERROR) << "meta_graph is invalid, please check your model file.";
+      return false;
+    }
+
+    graph_.all_tensors_.resize(meta_graph.allTensors()->size());
+    MS_LOG(INFO) << "convert input/output tensors";
+    for (auto i: graph_.input_indices_) {
+      auto *tensor = meta_graph.allTensors()->template GetAs<schema::Tensor>(i);
+      if (tensor == nullptr) {
+        MS_LOG(ERROR) << "input tensor " << i << " in metagraph is nullptr";
+        return false;
+      }
+      MS_CHECK_TRUE_RET(tensor->format() >= schema::Format_MIN && tensor->format() <= schema::Format_MAX, false);
+      graph_.all_tensors_[i] = (const_cast<mindspore::schema::Tensor *>(tensor));
+    }
+
+    for (auto i: graph_.output_indices_) {
+      auto *tensor = meta_graph.allTensors()->template GetAs<schema::Tensor>(i);
+      if (tensor == nullptr) {
+        MS_LOG(ERROR) << "output tensor " << i << " in metagraph is nullptr";
+        return false;
+      }
+      MS_CHECK_TRUE_RET(tensor->format() >= schema::Format_MIN && tensor->format() <= schema::Format_MAX, false);
+      graph_.all_tensors_[i] = (const_cast<mindspore::schema::Tensor *>(tensor));
+    }
+    return true;
+  }
+
+  template <typename T = schema::MetaGraph, typename U = schema::CNode>
+  int GenerateModelInputOutput(const T &meta_graph, LiteGraph &graph_) {
+    if (meta_graph.name() != nullptr) {
+      graph_.name_ = meta_graph.name()->c_str();
+    }
+    if (meta_graph.version() != nullptr) {
+      graph_.version_ = meta_graph.version()->c_str();
+    }
+
+    if (meta_graph.inputIndex() == nullptr || meta_graph.outputIndex() == nullptr ||
+        meta_graph.allTensors() == nullptr) {
+      MS_LOG(ERROR) << "meta_graph is invalid, please check your model file.";
+      return RET_ERROR;
+    }
+
+    // converterInputOutput
+    auto in_count = meta_graph.inputIndex()->size();
+    for (uint32_t i = 0; i < in_count; ++i) {
+      graph_.input_indices_.push_back(meta_graph.inputIndex()->Get(i));
+    }
+    auto out_count = meta_graph.outputIndex()->size();
+    for (uint32_t i = 0; i < out_count; ++i) {
+      graph_.output_indices_.push_back(meta_graph.outputIndex()->Get(i));
+    }
+
+    if (!ConvertInputOutputTensors<T>(meta_graph, graph_)) {
+      MS_LOG(ERROR) << "convert tensor failed";
+      return RET_ERROR;
+    }
+    return RET_OK;
+  }
+
+  int ParseInputOutputFromModelBuffer(const char *model_buf, LiteModel *model);
+  int BindGLTexture2DMemory(const std::map<std::string, unsigned int> &inputGLTexture,
+                            std::map<std::string, unsigned int> *outputGLTexture) override {
+    return RET_ERROR;
+  }
+
+ protected:
+  int ScheduleToNNRTKernel();
+  Status CreateFullModelKernel();
+  Status InitNNCompilation(OH_NNCompilation *nn_compilation) const;
+  int ConvertInOutTensors(const lite::Model *model);
+  int InitExecutor() override;
+  std::vector<mindspore::MSTensor> ms_inputs_;
+  std::vector<mindspore::MSTensor> ms_outputs_;
+
+ private:
+  NNRtDeviceInfo nnrt_device_info_;
+  OH_NNExecutor *nn_executor_{nullptr};
+  nnrt::ExtensionOptions extension_options_;
+};
+}  // namespace lite
+}  // namespace mindspore
+
+#endif  // MINDSPORE_LITE_SRC_LITERT_CACHE_SESSION_H_
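
Note: a hypothetical use of the class declared above: the caller supplies an InnerContext whose device list contains a DT_NNRT entry, then compiles straight from the cache. Structure and field names follow the code referenced in this patch; treat it as a sketch, not a supported entry point:

#include <memory>
#include <string>
#include "src/litert/cache_session.h"

int CompileFromCache(const std::string &model_path, size_t nnrt_device_id) {
  auto context = std::make_shared<mindspore::lite::InnerContext>();
  mindspore::lite::DeviceContext device;
  device.device_type_ = mindspore::lite::DT_NNRT;
  device.device_info_.nnrt_device_info_.device_id_ = nnrt_device_id;
  // The CachePath / CacheVersion / ModelName extensions attached to this device
  // are what ExtensionOptionsParser::Parse reads back in CacheSession::Init.
  context->device_list_.push_back(device);

  mindspore::lite::CacheSession session;
  if (session.Init(context) != mindspore::lite::RET_OK) {
    return -1;
  }
  return session.LoadModelAndCompileByPath(model_path, mindspore::ModelType::kMindIR_Lite);
}
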
diff --git a/mindspore/lite/src/litert/cxx_api/model/model_impl.cc b/mindspore/lite/src/litert/cxx_api/model/model_impl.cc
index 02533dc3..cacbf86e 100644
--- a/mindspore/lite/src/litert/cxx_api/model/model_impl.cc
+++ b/mindspore/lite/src/litert/cxx_api/model/model_impl.cc
@@ -39,6 +39,11 @@
 #include "src/common/config_file.h"
 #include "src/litert/cpu_info.h"
 #include "src/litert/pack_weight_manager.h"
+#ifdef SUPPORT_NNRT_METAGRAPH
+#include "src/litert/cache_session.h"
+#include "src/litert/delegate/nnrt/extension_options_parser.h"
+#endif
+
 namespace mindspore {
 namespace {
 const char *const kExecutionPlan = "execution_plan";
@@ -1006,7 +1011,36 @@ float ModelImpl::GetLearningRate() {
 }
 
 lite::LiteSession *ModelImpl::CreateLiteSession(const std::shared_ptr<lite::InnerContext> &context) {
-  auto session = new (std::nothrow) lite::LiteSession();
+  if (context == nullptr) {
+    MS_LOG(ERROR) << "context is nullptr";
+    return nullptr;
+  }
+  lite::LiteSession *session = nullptr;
+#ifdef SUPPORT_NNRT_METAGRAPH
+  auto iter = std::find_if(context->device_list_.begin(), context->device_list_.end(),
+                           [](lite::DeviceContext &device) { return device.device_type_ == lite::DT_NNRT; });
+  if (iter != context->device_list_.end()) {
+    const auto &nnrt_device_info = iter->device_info_.nnrt_device_info_;
+    if (lite::CacheSession::IsKirinNPUWithOnlineInference(nnrt_device_info.device_id_)) {
+      const auto &extensions = nnrt_device_info.extensions_;
+      lite::nnrt::ExtensionOptions extension_options;
+      mindspore::lite::nnrt::ExtensionOptionsParser::Parse(extensions, &extension_options);
+      auto has_cache = OH_NNModel_HasCache(extension_options.cache_path_.c_str(), extension_options.model_name.c_str(),
+                                           extension_options.cache_version_);
+      if (has_cache) {
+        session = new (std::nothrow) lite::CacheSession();
+        if (session == nullptr) {
+          MS_LOG(ERROR) << "create cache session failed";
+          return nullptr;
+        }
+      }
+    }
+  }
+#endif
+
+  if (session == nullptr) {
+    session = new (std::nothrow) lite::LiteSession();
+  }
   if (session == nullptr) {
     MS_LOG(ERROR) << "create session failed";
     return nullptr;
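
Note: the branch above only switches to CacheSession when the NNRT device is a Kirin NPU and a compiled cache already exists on disk. Condensed into a hypothetical helper for readability (not part of the patch; type spellings follow the surrounding code):

#ifdef SUPPORT_NNRT_METAGRAPH
// True when the configured NNRT device is a Kirin NPU and a cache matching
// CachePath / ModelName / CacheVersion has already been compiled.
bool ShouldUseCacheSession(const lite::DeviceContext &device) {
  if (device.device_type_ != lite::DT_NNRT) {
    return false;
  }
  const auto &nnrt_device_info = device.device_info_.nnrt_device_info_;
  if (!lite::CacheSession::IsKirinNPUWithOnlineInference(nnrt_device_info.device_id_)) {
    return false;
  }
  lite::nnrt::ExtensionOptions options;
  lite::nnrt::ExtensionOptionsParser::Parse(nnrt_device_info.extensions_, &options);
  return OH_NNModel_HasCache(options.cache_path_.c_str(), options.model_name.c_str(),
                             options.cache_version_);
}
#endif
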
diff --git a/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.cc b/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.cc
index e35cc2a5..a66cd5ea 100644
--- a/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.cc
+++ b/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.cc
@@ -30,6 +30,7 @@ const std::string kCachePath = "CachePath";
 const std::string kCacheVersion = "CacheVersion";
 const std::string kBandMode = "BandMode";
 const std::string kQuantConfigData = "QuantConfigData";
+const std::string kModelName = "ModelName";
 }  // namespace
 
 int ExtensionOptionsParser::Parse(const std::vector<Extension> &extensions, ExtensionOptions *param) {
@@ -39,6 +40,7 @@ int ExtensionOptionsParser::Parse(const std::vector<Extension> &extensions, Exte
   DoParseCacheVersion(extensions, &param->cache_version_);
   DoParseBondMode(extensions, &param->band_mode);
   DoParseQuantConfig(extensions, &param->quant_config, &param->quant_config_size, &param->is_optional_quant_setted);
+  DoParseModelName(extensions, &param->model_name);
   return RET_OK;
 }
 
@@ -89,4 +91,14 @@ void ExtensionOptionsParser::DoParseQuantConfig(const std::vector<Extension> &ex
     *quant_setted = true;
   }
 }
+
+void ExtensionOptionsParser::DoParseModelName(const std::vector<Extension> &extensions, std::string *model_name) {
+  MS_CHECK_TRUE_RET_VOID(model_name != nullptr);
+  auto iter_config = std::find_if(extensions.begin(), extensions.end(), [](const Extension &extension) {
+    return extension.name == kModelName;
+  });
+  if (iter_config != extensions.end()) {
+    *model_name = std::string(iter_config->value.begin(), iter_config->value.end());
+  }
+}
 }  // mindspore::lite::nnrt
\ No newline at end of file
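
Note: "ModelName" travels as a raw byte extension like the existing options, and DoParseModelName simply copies those bytes into a std::string. A minimal sketch of how a caller might populate the option, assuming Extension exposes the name/value members used above:

#include <string>
#include <vector>

// Builds an Extension whose value holds the UTF-8 bytes of `text` (illustrative helper).
Extension MakeStringExtension(const std::string &name, const std::string &text) {
  Extension extension;
  extension.name = name;
  extension.value.assign(text.begin(), text.end());
  return extension;
}

// Usage sketch:
//   std::vector<Extension> extensions{MakeStringExtension("ModelName", "my_model")};
//   ExtensionOptions options;
//   ExtensionOptionsParser::Parse(extensions, &options);  // options.model_name == "my_model"
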
diff --git a/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.h b/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.h
index f24682ce..9a030ad6 100644
--- a/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.h
+++ b/mindspore/lite/src/litert/delegate/nnrt/extension_options_parser.h
@@ -29,6 +29,7 @@ struct ExtensionOptions {
   void *quant_config;
   size_t quant_config_size = 0;
   bool is_optional_quant_setted = false;
+  std::string model_name = "";
 };
 
 class ExtensionOptionsParser {
@@ -41,6 +42,7 @@ private:
                                  bool *quant_setted);
   static void DoParseCachePath(const std::vector<Extension> &extensions, std::string *cache_path);
   static void DoParseCacheVersion(const std::vector<Extension> &extensions, uint32_t *cache_version);
+  static void DoParseModelName(const std::vector<Extension> &extensions, std::string *model_name);
 };
 
 }  // namespace mindspore::lite::nnrt
diff --git a/mindspore/lite/src/litert/lite_model.cc b/mindspore/lite/src/litert/lite_model.cc
index 006bc02c..5acf5760 100644
--- a/mindspore/lite/src/litert/lite_model.cc
+++ b/mindspore/lite/src/litert/lite_model.cc
@@ -538,14 +538,16 @@ bool LiteModel::PrepareInnerTensors() {
       MS_LOG(ERROR) << "Create SchemaTensorWrapper return nullptr";
       return false;
     }
+    if (graph_.all_tensors_.at(i) != nullptr) {
 #ifdef ENABLE_LITE_HELPER
-    if (!tensor_wrapper->Init(*(graph_.all_tensors_.at(i)), static_cast<SCHEMA_VERSION>(schema_version_), dir,
-                              infer_helpers)) {
+      if (!tensor_wrapper->Init(*(graph_.all_tensors_.at(i)), static_cast<SCHEMA_VERSION>(schema_version_), dir,
+                                infer_helpers)) {
 #else
-    if (!tensor_wrapper->Init(*(graph_.all_tensors_.at(i)), static_cast<SCHEMA_VERSION>(schema_version_), dir)) {
+      if (!tensor_wrapper->Init(*(graph_.all_tensors_.at(i)), static_cast<SCHEMA_VERSION>(schema_version_), dir)) {
 #endif
-      delete tensor_wrapper;
-      return false;
+        delete tensor_wrapper;
+        return false;
+      }
     }
     this->inner_all_tensors_[i] = tensor_wrapper;
   }
diff --git a/mindspore/lite/src/litert/lite_model.h b/mindspore/lite/src/litert/lite_model.h
index 647746a2..c0847c1e 100644
--- a/mindspore/lite/src/litert/lite_model.h
+++ b/mindspore/lite/src/litert/lite_model.h
@@ -66,13 +66,13 @@ class MS_API LiteModel : public Model {
 
   static int VersionVerify(flatbuffers::Verifier *verify);
 
- private:
 #ifdef ENABLE_LITE_HELPER
   bool PrepareInnerTensors(mindspore::infer::helper::InferHelpers *infer_helpers = nullptr);
 #else
   bool PrepareInnerTensors();
 #endif
 
+ private:
   bool CheckQuantAllInit(const flatbuffers::Vector<flatbuffers::Offset<mindspore::schema::QuantParam>> *quant_params);
 
   template <typename T = schema::MetaGraph, typename U = schema::CNode>
diff --git a/mindspore/lite/src/litert/lite_session.h b/mindspore/lite/src/litert/lite_session.h
index 64a5f6d3..487b382a 100644
--- a/mindspore/lite/src/litert/lite_session.h
+++ b/mindspore/lite/src/litert/lite_session.h
@@ -57,10 +57,10 @@ class MS_API LiteSession {
 #else
   int LoadModelAndCompileByBuf(const char *model_buf, mindspore::ModelType model_type, const size_t &buf_size);
 #endif
-  int LoadModelAndCompileByPath(const std::string &model_path, mindspore::ModelType model_type);
+  virtual int LoadModelAndCompileByPath(const std::string &model_path, mindspore::ModelType model_type);
   mindspore::ModelType LoadModelByBuff(const char *model_buf, const size_t &buf_size, char **lite_buf, size_t *size,
                                        mindspore::ModelType model_type);
-  const char *LoadModelByPath(const std::string &file, mindspore::ModelType model_type, size_t *size, bool use_mmap);
+  virtual const char *LoadModelByPath(const std::string &file, mindspore::ModelType model_type, size_t *size, bool use_mmap);
   virtual int Init(const std::shared_ptr<InnerContext> &context);
   virtual void BindThread(bool if_bind);
   virtual int CompileGraph(Model *model);
@@ -168,10 +168,10 @@ class MS_API LiteSession {
   static void MarkSharedWeight(const std::vector<kernel::KernelExec *> &kernels);
   std::string ParseWeightPath();
   bool IsMmapEnable();
+  virtual int InitExecutor();
 
  private:
   int PreCheck(Model *model);
-  int InitExecutor();
   void ResetInputsShape(const std::vector<std::vector<int>> &dims);
   int ContextInit(const std::shared_ptr<InnerContext> &context);
   int CreateTensorRTDelegate();
-- 
2.17.1

