1// Copyright (C) 2018 Intel Corporation
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"),
5// to deal in the Software without restriction, including without limitation
6// the rights to use, copy, modify, merge, publish, distribute, sublicense,
7// and/or sell copies of the Software, and to permit persons to whom
8// the Software is furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included
11// in all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
14// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
16// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
17// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
18// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
19// OR OTHER DEALINGS IN THE SOFTWARE.
20//
21// SPDX-License-Identifier: MIT
22
23// The functions in this file map the .text section of Node.js into 2MB pages.
24// They perform the following steps:
25//
26// 1: Find the Node.js binary's `.text` section in memory. This is done below in
27//    `FindNodeTextRegion`. It is accomplished in a platform-specific way. On
28//    Linux and FreeBSD, `dl_iterate_phdr(3)` is used. When the region is found,
29//    it is "trimmed" as follows:
30//    * Modify the start to point to the very beginning of the Node.js `.text`
31//      section (from symbol `__node_text_start` declared in node_text_start.S).
32//    * Possibly modify the end to account for the `lpstub` section which
33//      contains `MoveTextRegionToLargePages`, the function we do not wish to
34//      move (see below).
35//    * Align the address of the start to its nearest higher large page
36//      boundary.
37//    * Align the address of the end to its nearest lower large page boundary.
38//
39// 2: Move the text region to large pages. This is done below in
40//    `MoveTextRegionToLargePages`. We need to be very careful:
41//    a) `MoveTextRegionToLargePages` itself should not be moved.
42//       We use gcc attributes
43//       (__section__) to put it outside the `.text` section,
44//       (__aligned__) to align it at the 2M boundary, and
//       (__noinline__) to not inline this function.
46//    b) `MoveTextRegionToLargePages` should not call any function(s) that might
47//       be moved.
48//    To move the .text section, perform the following steps:
49//      * Map a new, temporary area and copy the original code there.
50//      * Use mmap using the start address with MAP_FIXED so we get exactly the
51//        same virtual address (except on OSX). On platforms other than Linux,
52//        use mmap flags to request hugepages.
53//      * On Linux use madvise with MADV_HUGEPAGE to use anonymous 2MB pages.
54//      * If successful copy the code to the newly mapped area and protect it to
55//        be readable and executable.
56//      * Unmap the temporary area.
57
58#include "node_large_page.h"
59
60#include <cerrno>   // NOLINT(build/include)
61
62// Besides returning ENOTSUP at runtime we do nothing if this define is missing.
63#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
64#include "debug_utils-inl.h"
65
66#if defined(__linux__) || defined(__FreeBSD__)
67#if defined(__linux__)
68#ifndef _GNU_SOURCE
69#define _GNU_SOURCE
70#endif  // ifndef _GNU_SOURCE
71#include <sys/prctl.h>
72#if !defined(PR_SET_VMA)
73#define PR_SET_VMA 0x53564d41
74#define PR_SET_VMA_ANON_NAME 0
75#endif
76#elif defined(__FreeBSD__)
77#include "uv.h"  // uv_exepath
78#endif  // defined(__linux__)
79#include <link.h>
80#endif  // defined(__linux__) || defined(__FreeBSD__)
81
82#include <sys/types.h>
83#include <sys/mman.h>
84#if defined(__FreeBSD__)
85#include <sys/sysctl.h>
86#elif defined(__APPLE__)
87#include <mach/vm_map.h>
88#endif
89
90#include <climits>  // PATH_MAX
91#include <cstdlib>
92#include <cstdint>
93#include <cstring>
94#include <string>
95#include <fstream>
96
97#if defined(__linux__) || defined(__FreeBSD__)
98extern "C" {
99// This symbol must be declared weak because this file becomes part of all
100// Node.js targets (like node_mksnapshot, node_mkcodecache, and cctest) and
101// those files do not supply the symbol.
102extern char __attribute__((weak)) __node_text_start;
103extern char __start_lpstub;
104}  // extern "C"
105#endif  // defined(__linux__) || defined(__FreeBSD__)
106
107#endif  // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
108namespace node {
109#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
110
111namespace {
112
// Describes the candidate `.text` span to remap, already trimmed and aligned
// to large-page boundaries by FindNodeTextRegion().
struct text_region {
  char* from = nullptr;  // first byte of the aligned region
  char* to = nullptr;    // one past the last byte of the aligned region
  bool found_text_region = false;  // true only if a usable region was found
};
118
// Large ("huge") page size used throughout this file: 2 MiB.
// constexpr (rather than const) makes the compile-time-constant intent
// explicit; it is used below in a constant-expression context
// (__attribute__((__aligned__(hps)))).
static constexpr size_t hps = 2L * 1024 * 1024;
120
121template <typename... Args>
122inline void Debug(std::string fmt, Args&&... args) {
123  node::Debug(&per_process::enabled_debug_list,
124              DebugCategory::HUGEPAGES,
125              (std::string("Hugepages info: ") + fmt).c_str(),
126              std::forward<Args>(args)...);
127}
128
// Emit a hugepages warning on stderr, unconditionally (not gated on the
// debug category).
inline void PrintWarning(const char* warn) {
  fputs("Hugepages WARNING: ", stderr);
  fputs(warn, stderr);
  fputc('\n', stderr);
}

// Report a system error (an errno value) as a hugepages warning.
inline void PrintSystemError(int error) {
  PrintWarning(strerror(error));
}
136
137inline uintptr_t hugepage_align_up(uintptr_t addr) {
138  return (((addr) + (hps) - 1) & ~((hps) - 1));
139}
140
141inline uintptr_t hugepage_align_down(uintptr_t addr) {
142  return ((addr) & ~((hps) - 1));
143}
144
145#if defined(__linux__) || defined(__FreeBSD__)
146#if defined(__FreeBSD__)
147#ifndef ElfW
148#define ElfW(name) Elf_##name
149#endif  // ifndef ElfW
150#endif  // defined(__FreeBSD__)
151
// In/out parameters for the FindMapping callback below.
struct dl_iterate_params {
  uintptr_t start = 0;  // out: start of the matching executable PT_LOAD segment
  uintptr_t end = 0;    // out: end of that segment
  // in: an address that must fall inside the segment we are looking for.
  // __node_text_start is declared weak, so this is null in targets that do
  // not supply the symbol (see the extern "C" block above).
  uintptr_t reference_sym = reinterpret_cast<uintptr_t>(&__node_text_start);
  // in: expected dlpi_name of the binary; empty on Linux, absolute path on
  // FreeBSD (see the comment in FindNodeTextRegion).
  std::string exename;
};
158
159int FindMapping(struct dl_phdr_info* info, size_t, void* data) {
160  auto dl_params = static_cast<dl_iterate_params*>(data);
161  if (dl_params->exename == std::string(info->dlpi_name)) {
162    for (int idx = 0; idx < info->dlpi_phnum; idx++) {
163      const ElfW(Phdr)* phdr = &info->dlpi_phdr[idx];
164      if (phdr->p_type == PT_LOAD && (phdr->p_flags & PF_X)) {
165        uintptr_t start = info->dlpi_addr + phdr->p_vaddr;
166        uintptr_t end = start + phdr->p_memsz;
167
168        if (dl_params->reference_sym >= start &&
169            dl_params->reference_sym <= end) {
170          dl_params->start = start;
171          dl_params->end = end;
172          return 1;
173        }
174      }
175    }
176  }
177  return 0;
178}
179#endif  // defined(__linux__) || defined(__FreeBSD__)
180
// Locate the portion of the Node.js `.text` section that can be remapped to
// large pages. On Linux/FreeBSD this walks the program headers via
// dl_iterate_phdr(3); on macOS it scans the task's VM regions for the first
// readable+executable region. The result is trimmed (to start at
// __node_text_start and to exclude the lpstub section) and aligned inward to
// large-page boundaries; found_text_region is set only when at least one
// whole huge page remains after alignment.
struct text_region FindNodeTextRegion() {
  struct text_region nregion;
#if defined(__linux__) || defined(__FreeBSD__)
  dl_iterate_params dl_params;
  // lpstub holds MoveTextRegionToLargePages, which must NOT be remapped.
  uintptr_t lpstub_start = reinterpret_cast<uintptr_t>(&__start_lpstub);

#if defined(__FreeBSD__)
  // On FreeBSD we need the name of the binary, because `dl_iterate_phdr` does
  // not pass in an empty string as the `dlpi_name` of the binary but rather its
  // absolute path.
  {
    char selfexe[PATH_MAX];
    size_t count = sizeof(selfexe);
    if (uv_exepath(selfexe, &count))
      return nregion;
    dl_params.exename = std::string(selfexe, count);
  }
#endif  // defined(__FreeBSD__)

  if (dl_iterate_phdr(FindMapping, &dl_params) == 1) {
    Debug("start: %p - sym: %p - end: %p\n",
          reinterpret_cast<void*>(dl_params.start),
          reinterpret_cast<void*>(dl_params.reference_sym),
          reinterpret_cast<void*>(dl_params.end));

    // Trim the segment's start to the true beginning of `.text`
    // (__node_text_start), and its end to the start of lpstub when lpstub
    // lies inside the segment.
    dl_params.start = dl_params.reference_sym;
    if (lpstub_start > dl_params.start && lpstub_start <= dl_params.end) {
      Debug("Trimming end for lpstub: %p\n",
            reinterpret_cast<void*>(lpstub_start));
      dl_params.end = lpstub_start;
    }

    if (dl_params.start < dl_params.end) {
      // Align inward: round the start up and the end down so the region
      // consists only of whole huge pages.
      char* from = reinterpret_cast<char*>(hugepage_align_up(dl_params.start));
      char* to = reinterpret_cast<char*>(hugepage_align_down(dl_params.end));
      Debug("Aligned range is %p - %p\n", from, to);
      if (from < to) {
        size_t pagecount = (to - from) / hps;
        if (pagecount > 0) {
          nregion.found_text_region = true;
          nregion.from = from;
          nregion.to = to;
        }
      }
    }
  }
#elif defined(__APPLE__)
  struct vm_region_submap_info_64 map;
  mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
  vm_address_t addr = 0UL;
  vm_size_t size = 0;
  natural_t depth = 1;

  // Walk the task's VM map; take the first non-submap region that is both
  // readable and executable and still spans at least one huge page after
  // inward alignment.
  while (true) {
    if (vm_region_recurse_64(mach_task_self(), &addr, &size, &depth,
                             reinterpret_cast<vm_region_info_64_t>(&map),
                             &count) != KERN_SUCCESS) {
      break;
    }

    if (map.is_submap) {
      // Descend into submaps rather than treating them as regions.
      depth++;
    } else {
      char* start = reinterpret_cast<char*>(hugepage_align_up(addr));
      char* end = reinterpret_cast<char*>(hugepage_align_down(addr+size));

      if (end > start && (map.protection & VM_PROT_READ) != 0 &&
          (map.protection & VM_PROT_EXECUTE) != 0) {
        nregion.found_text_region = true;
        nregion.from = start;
        nregion.to = end;
        break;
      }

      addr += size;
      size = 0;
    }
  }
#endif
  // When nothing was found, from/to are both null, so this logs 0 pages.
  Debug("Found %d huge pages\n", (nregion.to - nregion.from) / hps);
  return nregion;
}
263
264#if defined(__linux__)
265bool IsTransparentHugePagesEnabled() {
266  // File format reference:
267  // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/huge_memory.c?id=13391c60da3308ed9980de0168f74cce6c62ac1d#n163
268  const char* filename = "/sys/kernel/mm/transparent_hugepage/enabled";
269  std::ifstream config_stream(filename, std::ios::in);
270  if (!config_stream.good()) {
271    PrintWarning("could not open /sys/kernel/mm/transparent_hugepage/enabled");
272    return false;
273  }
274
275  std::string token;
276  config_stream >> token;
277  if ("[always]" == token) return true;
278  config_stream >> token;
279  if ("[madvise]" == token) return true;
280  return false;
281}
282#elif defined(__FreeBSD__)
283bool IsSuperPagesEnabled() {
284  // It is enabled by default on amd64.
285  unsigned int super_pages = 0;
286  size_t super_pages_length = sizeof(super_pages);
287  return sysctlbyname("vm.pmap.pg_ps_enabled",
288                      &super_pages,
289                      &super_pages_length,
290                      nullptr,
291                      0) != -1 &&
292         super_pages >= 1;
293}
294#endif
295
// RAII owner of one mmap(2) mapping. Functions in this class must always be
// inlined because they must end up in the `lpstub` section rather than the
// `.text` section (MoveTextRegionToLargePages calls them while `.text` is
// being replaced).
class MemoryMapPointer {
 public:
  FORCE_INLINE explicit MemoryMapPointer() {}
  FORCE_INLINE bool operator==(void* rhs) const { return mem_ == rhs; }
  FORCE_INLINE void* mem() const { return mem_; }
  // The mapping is uniquely owned: no copies, no moves.
  MemoryMapPointer(const MemoryMapPointer&) = delete;
  MemoryMapPointer(MemoryMapPointer&&) = delete;
  void operator= (const MemoryMapPointer&) = delete;
  void operator= (const MemoryMapPointer&&) = delete;
  // Acquire a fresh mapping of `size` bytes via mmap(). The raw result —
  // possibly MAP_FAILED — is stored as-is; callers must check mem().
  FORCE_INLINE void Reset(void* start,
                          size_t size,
                          int prot,
                          int flags,
                          int fd = -1,
                          size_t offset = 0) {
    mem_ = mmap(start, size, prot, flags, fd, offset);
    size_ = size;
  }
  // Release ownership WITHOUT unmapping — used when the mapping must outlive
  // this object (e.g. it now holds the running code).
  FORCE_INLINE void Reset() {
    mem_ = nullptr;
    size_ = 0;
  }
  // Attach a human-readable name to an anonymous mapping (visible in
  // /proc/<pid>/maps). No-op off Linux.
  static void SetName(void* mem, size_t size, const char* name) {
#if defined(__linux__)
    // Available since the 5.17 kernel release and if the
    // CONFIG_ANON_VMA_NAME option is set, we can attach an identifier
    // to an anonymously mapped region. If the kernel option is not
    // present or it is an older kernel, this is a no-op.
    if (mem != MAP_FAILED && mem != nullptr)
        prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
            reinterpret_cast<uintptr_t>(mem),
            size,
            reinterpret_cast<uintptr_t>(name));
#else
    (void)name;
#endif
  }
  // Unmap on destruction, unless ownership was released via Reset() or the
  // mapping never succeeded.
  FORCE_INLINE ~MemoryMapPointer() {
    if (mem_ == nullptr) return;
    if (mem_ == MAP_FAILED) return;
    if (munmap(mem_, size_) == 0) return;
    PrintSystemError(errno);
  }

 private:
  size_t size_ = 0;
  void* mem_ = nullptr;
};
346
347}  // End of anonymous namespace
348
// Remap the given (already large-page-aligned) text region onto anonymous 2MB
// pages. Returns 0 on success, -1 on failure (after printing the system
// error). This function replaces the very mapping that contains `.text`
// while it runs, so it is placed in its own `lpstub` section, 2MB-aligned,
// and never inlined; it must not call anything that lives in `.text`.
int
#if !defined(__APPLE__)
__attribute__((__section__("lpstub")))
#else
__attribute__((__section__("__TEXT,__lpstub")))
#endif
__attribute__((__aligned__(hps)))
__attribute__((__noinline__))
MoveTextRegionToLargePages(const text_region& r) {
  MemoryMapPointer nmem;  // temporary backup copy of the original code
  MemoryMapPointer tmem;  // the replacement mapping
  void* start = r.from;
  size_t size = r.to - r.from;

  // Allocate a temporary region and back up the code we will re-map.
  nmem.Reset(nullptr, size,
             PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS);
  if (nmem.mem() == MAP_FAILED) goto fail;
  memcpy(nmem.mem(), r.from, size);

#if defined(__linux__)
// We already know the original page is r-xp
// (PROT_READ, PROT_EXEC, MAP_PRIVATE)
// We want PROT_WRITE because we are writing into it.
// We want it at the fixed address and we use MAP_FIXED.
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED);
  if (tmem.mem() == MAP_FAILED) goto fail;
  // 14 is MADV_HUGEPAGE, hard-coded so we do not depend on the constant
  // being present in older kernel headers.
  if (madvise(tmem.mem(), size, 14 /* MADV_HUGEPAGE */) == -1) goto fail;
  memcpy(start, nmem.mem(), size);
#elif defined(__FreeBSD__)
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
             MAP_ALIGNED_SUPER);
  if (tmem.mem() == MAP_FAILED) goto fail;
  memcpy(start, nmem.mem(), size);
#elif defined(__APPLE__)
  // There is not enough room to reserve the mapping at the region's address,
  // so we only pass a hint without forcing the new address to be close to
  // it. We explicitly request all permissions since we plan to write into
  // the mapping. NOTE(review): on macOS the fd argument of mmap() carries VM
  // flags for anonymous mappings, hence VM_FLAGS_SUPERPAGE_SIZE_2MB being
  // passed in the fd position of Reset() — confirm against the mmap(2)
  // macOS man page.
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS,
             VM_FLAGS_SUPERPAGE_SIZE_2MB);
  if (tmem.mem() == MAP_FAILED) goto fail;
  memcpy(tmem.mem(), nmem.mem(), size);
  if (mprotect(start, size, PROT_READ | PROT_WRITE | PROT_EXEC) == -1)
    goto fail;
  memcpy(start, tmem.mem(), size);
#endif

  // Drop write permission again so the region is r-x like the original.
  if (mprotect(start, size, PROT_READ | PROT_EXEC) == -1) goto fail;
  MemoryMapPointer::SetName(start, size, "nodejs Large Page");

  // We need not `munmap(tmem, size)` on success: the mapping now contains
  // the running code, so ownership is released instead.
  tmem.Reset();
  return 0;
fail:
  PrintSystemError(errno);
  return -1;
}
414#endif  // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
415
// Primary API, called from main: remap the Node.js `.text` section onto
// large pages. Returns 0 on success, ENOTSUP/EACCES/ENOENT when large pages
// cannot be used, or -1 when the remapping itself failed.
int MapStaticCodeToLargePages() {
#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
#if defined(__linux__)
  const bool huge_pages_enabled = IsTransparentHugePagesEnabled();
#elif defined(__FreeBSD__)
  const bool huge_pages_enabled = IsSuperPagesEnabled();
#elif defined(__APPLE__)
  // pse-36 flag is present in recent mac x64 products.
  const bool huge_pages_enabled = true;
#else
  const bool huge_pages_enabled = false;
#endif
  if (!huge_pages_enabled)
    return EACCES;

  const struct text_region region = FindNodeTextRegion();
  if (!region.found_text_region)
    return ENOENT;

  return MoveTextRegionToLargePages(region);
#else
  return ENOTSUP;
#endif
}
440
// Translate a status code returned by MapStaticCodeToLargePages() into a
// human-readable message.
const char* LargePagesError(int status) {
  if (status == 0)
    return "OK";
  if (status == ENOTSUP)
    return "Mapping to large pages is not supported.";
  if (status == EACCES)
    return "Large pages are not enabled.";
  if (status == ENOENT)
    return "failed to find text region";
  if (status == -1)
    return "Mapping code to large pages failed. Reverting to default page "
        "size.";
  return "Unknown error";
}
463
464}  // namespace node
465