// Copyright (C) 2018 Intel Corporation
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom
// the Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
// OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
// OR OTHER DEALINGS IN THE SOFTWARE.
//
// SPDX-License-Identifier: MIT

// The functions in this file map the .text section of Node.js into 2MB pages.
// They perform the following steps:
//
// 1: Find the Node.js binary's `.text` section in memory. This is done below
//    in `FindNodeTextRegion`. It is accomplished in a platform-specific way.
//    On Linux and FreeBSD, `dl_iterate_phdr(3)` is used. When the region is
//    found, it is "trimmed" as follows:
//    * Modify the start to point to the very beginning of the Node.js `.text`
//      section (from symbol `__node_text_start` declared in node_text_start.S).
//    * Possibly modify the end to account for the `lpstub` section which
//      contains `MoveTextRegionToLargePages`, the function we do not wish to
//      move (see below).
//    * Align the address of the start to its nearest higher large page
//      boundary.
//    * Align the address of the end to its nearest lower large page boundary.
//
// 2: Move the text region to large pages. This is done below in
//    `MoveTextRegionToLargePages`. We need to be very careful:
//    a) `MoveTextRegionToLargePages` itself should not be moved.
//       We use gcc attributes
//       (__section__) to put it outside the `.text` section,
//       (__aligned__) to align it at the 2M boundary, and
//       (__noinline__) to keep it from being inlined.
//    b) `MoveTextRegionToLargePages` should not call any function(s) that
//       might be moved.
//    To move the .text section, perform the following steps:
//      * Map a new, temporary area and copy the original code there.
//      * Call mmap with the original start address and MAP_FIXED so we get
//        exactly the same virtual address (except on OSX). On platforms other
//        than Linux, use mmap flags to request hugepages.
//      * On Linux, use madvise with MADV_HUGEPAGE to use anonymous 2MB pages.
//      * If successful, copy the code to the newly mapped area and protect it
//        to be readable and executable.
//      * Unmap the temporary area.
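//
// As a concrete illustration of the trimming in step 1 (all addresses here
// are hypothetical): if the executable PT_LOAD segment spans
// 0x400000-0xA81000 and `__node_text_start` sits at 0x43D000, the region
// becomes 0x43D000-0xA81000; aligning the start up and the end down to 2MB
// boundaries (see `hugepage_align_up`/`hugepage_align_down` below) yields
// 0x600000-0xA00000, i.e. exactly two 2MB pages.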

#include "node_large_page.h"

#include <cerrno>  // NOLINT(build/include)

// Besides returning ENOTSUP at runtime we do nothing if this define is missing.
#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
#include "debug_utils-inl.h"

#if defined(__linux__) || defined(__FreeBSD__)
#if defined(__linux__)
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif  // ifndef _GNU_SOURCE
#include <sys/prctl.h>
#if !defined(PR_SET_VMA)
#define PR_SET_VMA 0x53564d41
#define PR_SET_VMA_ANON_NAME 0
#endif
#elif defined(__FreeBSD__)
#include "uv.h"  // uv_exepath
#endif  // defined(__linux__)
#include <link.h>
#endif  // defined(__linux__) || defined(__FreeBSD__)

#include <sys/types.h>
#include <sys/mman.h>
#if defined(__FreeBSD__)
#include <sys/sysctl.h>
#elif defined(__APPLE__)
#include <mach/vm_map.h>
#endif

#include <climits>  // PATH_MAX
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <string>
#include <fstream>

#if defined(__linux__) || defined(__FreeBSD__)
extern "C" {
// This symbol must be declared weak because this file becomes part of all
// Node.js targets (like node_mksnapshot, node_mkcodecache, and cctest) and
// those files do not supply the symbol.
extern char __attribute__((weak)) __node_text_start;
extern char __start_lpstub;
}  // extern "C"
#endif  // defined(__linux__) || defined(__FreeBSD__)

#endif  // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
namespace node {
#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES

namespace {

struct text_region {
  char* from = nullptr;
  char* to = nullptr;
  bool found_text_region = false;
};

static const size_t hps = 2L * 1024 * 1024;

template <typename... Args>
inline void Debug(std::string fmt, Args&&... args) {
  node::Debug(&per_process::enabled_debug_list,
              DebugCategory::HUGEPAGES,
              (std::string("Hugepages info: ") + fmt).c_str(),
              std::forward<Args>(args)...);
}

inline void PrintWarning(const char* warn) {
  fprintf(stderr, "Hugepages WARNING: %s\n", warn);
}

inline void PrintSystemError(int error) {
  PrintWarning(strerror(error));
}

inline uintptr_t hugepage_align_up(uintptr_t addr) {
  return (((addr) + (hps) - 1) & ~((hps) - 1));
}

inline uintptr_t hugepage_align_down(uintptr_t addr) {
  return ((addr) & ~((hps) - 1));
}

#if defined(__linux__) || defined(__FreeBSD__)
#if defined(__FreeBSD__)
#ifndef ElfW
#define ElfW(name) Elf_##name
#endif  // ifndef ElfW
#endif  // defined(__FreeBSD__)

struct dl_iterate_params {
  uintptr_t start = 0;
  uintptr_t end = 0;
  uintptr_t reference_sym = reinterpret_cast<uintptr_t>(&__node_text_start);
  std::string exename;
};

int FindMapping(struct dl_phdr_info* info, size_t, void* data) {
  auto dl_params = static_cast<dl_iterate_params*>(data);
  if (dl_params->exename == std::string(info->dlpi_name)) {
    for (int idx = 0; idx < info->dlpi_phnum; idx++) {
      const ElfW(Phdr)* phdr = &info->dlpi_phdr[idx];
      if (phdr->p_type == PT_LOAD && (phdr->p_flags & PF_X)) {
        uintptr_t start = info->dlpi_addr + phdr->p_vaddr;
        uintptr_t end = start + phdr->p_memsz;

        if (dl_params->reference_sym >= start &&
            dl_params->reference_sym <= end) {
          dl_params->start = start;
          dl_params->end = end;
          return 1;
        }
      }
    }
  }
  return 0;
}
#endif  // defined(__linux__) || defined(__FreeBSD__)
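
// Note: `dl_iterate_phdr(3)` stops iterating as soon as the callback returns
// a non-zero value and propagates that value to its caller. `FindMapping`
// returns 1 once it finds the executable PT_LOAD segment containing
// `__node_text_start`, which is why the call in `FindNodeTextRegion` below
// compares the result against 1.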
struct text_region FindNodeTextRegion() {
  struct text_region nregion;
#if defined(__linux__) || defined(__FreeBSD__)
  dl_iterate_params dl_params;
  uintptr_t lpstub_start = reinterpret_cast<uintptr_t>(&__start_lpstub);

#if defined(__FreeBSD__)
  // On FreeBSD we need the name of the binary, because `dl_iterate_phdr` does
  // not pass in an empty string as the `dlpi_name` of the binary but rather
  // its absolute path.
  {
    char selfexe[PATH_MAX];
    size_t count = sizeof(selfexe);
    if (uv_exepath(selfexe, &count))
      return nregion;
    dl_params.exename = std::string(selfexe, count);
  }
#endif  // defined(__FreeBSD__)

  if (dl_iterate_phdr(FindMapping, &dl_params) == 1) {
    Debug("start: %p - sym: %p - end: %p\n",
          reinterpret_cast<void*>(dl_params.start),
          reinterpret_cast<void*>(dl_params.reference_sym),
          reinterpret_cast<void*>(dl_params.end));

    dl_params.start = dl_params.reference_sym;
    if (lpstub_start > dl_params.start && lpstub_start <= dl_params.end) {
      Debug("Trimming end for lpstub: %p\n",
            reinterpret_cast<void*>(lpstub_start));
      dl_params.end = lpstub_start;
    }

    if (dl_params.start < dl_params.end) {
      char* from = reinterpret_cast<char*>(hugepage_align_up(dl_params.start));
      char* to = reinterpret_cast<char*>(hugepage_align_down(dl_params.end));
      Debug("Aligned range is %p - %p\n", from, to);
      if (from < to) {
        size_t pagecount = (to - from) / hps;
        if (pagecount > 0) {
          nregion.found_text_region = true;
          nregion.from = from;
          nregion.to = to;
        }
      }
    }
  }
#elif defined(__APPLE__)
  struct vm_region_submap_info_64 map;
  mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
  vm_address_t addr = 0UL;
  vm_size_t size = 0;
  natural_t depth = 1;

  while (true) {
    if (vm_region_recurse_64(mach_task_self(), &addr, &size, &depth,
                             reinterpret_cast<vm_region_info_64_t>(&map),
                             &count) != KERN_SUCCESS) {
      break;
    }

    if (map.is_submap) {
      depth++;
    } else {
      char* start = reinterpret_cast<char*>(hugepage_align_up(addr));
      char* end = reinterpret_cast<char*>(hugepage_align_down(addr + size));

      if (end > start && (map.protection & VM_PROT_READ) != 0 &&
          (map.protection & VM_PROT_EXECUTE) != 0) {
        nregion.found_text_region = true;
        nregion.from = start;
        nregion.to = end;
        break;
      }

      addr += size;
      size = 0;
    }
  }
#endif
  Debug("Found %d huge pages\n", (nregion.to - nregion.from) / hps);
  return nregion;
}

#if defined(__linux__)
bool IsTransparentHugePagesEnabled() {
  // File format reference:
  // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/huge_memory.c?id=13391c60da3308ed9980de0168f74cce6c62ac1d#n163
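  // The file holds a single line such as:
  //   always [madvise] never
  // with brackets marking the active policy, so it is enough to check whether
  // the first or the second token is the bracketed one.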
  const char* filename = "/sys/kernel/mm/transparent_hugepage/enabled";
  std::ifstream config_stream(filename, std::ios::in);
  if (!config_stream.good()) {
    PrintWarning("could not open /sys/kernel/mm/transparent_hugepage/enabled");
    return false;
  }

  std::string token;
  config_stream >> token;
  if ("[always]" == token) return true;
  config_stream >> token;
  if ("[madvise]" == token) return true;
  return false;
}
#elif defined(__FreeBSD__)
bool IsSuperPagesEnabled() {
  // It is enabled by default on amd64.
  unsigned int super_pages = 0;
  size_t super_pages_length = sizeof(super_pages);
  return sysctlbyname("vm.pmap.pg_ps_enabled",
                      &super_pages,
                      &super_pages_length,
                      nullptr,
                      0) != -1 &&
         super_pages >= 1;
}
#endif

// Functions in this class must always be inlined because they must end up in
// the `lpstub` section rather than the `.text` section.
class MemoryMapPointer {
 public:
  FORCE_INLINE explicit MemoryMapPointer() {}
  FORCE_INLINE bool operator==(void* rhs) const { return mem_ == rhs; }
  FORCE_INLINE void* mem() const { return mem_; }
  MemoryMapPointer(const MemoryMapPointer&) = delete;
  MemoryMapPointer(MemoryMapPointer&&) = delete;
  void operator= (const MemoryMapPointer&) = delete;
  void operator= (const MemoryMapPointer&&) = delete;
  FORCE_INLINE void Reset(void* start,
                          size_t size,
                          int prot,
                          int flags,
                          int fd = -1,
                          size_t offset = 0) {
    mem_ = mmap(start, size, prot, flags, fd, offset);
    size_ = size;
  }
  FORCE_INLINE void Reset() {
    mem_ = nullptr;
    size_ = 0;
  }
  static void SetName(void* mem, size_t size, const char* name) {
#if defined(__linux__)
    // Since the 5.17 kernel release, and provided the kernel was built with
    // the CONFIG_ANON_VMA_NAME option, we can attach an identifier to an
    // anonymously mapped region. If the option is absent or the kernel is
    // older, this is a no-op.
    if (mem != MAP_FAILED && mem != nullptr)
      prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
            reinterpret_cast<uintptr_t>(mem),
            size,
            reinterpret_cast<uintptr_t>(name));
#else
    (void)name;
#endif
  }
  FORCE_INLINE ~MemoryMapPointer() {
    if (mem_ == nullptr) return;
    if (mem_ == MAP_FAILED) return;
    if (munmap(mem_, size_) == 0) return;
    PrintSystemError(errno);
  }

 private:
  size_t size_ = 0;
  void* mem_ = nullptr;
};

}  // End of anonymous namespace

int
#if !defined(__APPLE__)
__attribute__((__section__("lpstub")))
#else
__attribute__((__section__("__TEXT,__lpstub")))
#endif
__attribute__((__aligned__(hps)))
__attribute__((__noinline__))
MoveTextRegionToLargePages(const text_region& r) {
  MemoryMapPointer nmem;
  MemoryMapPointer tmem;
  void* start = r.from;
  size_t size = r.to - r.from;

  // Allocate a temporary region and back up the code we will re-map.
  nmem.Reset(nullptr, size,
             PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS);
  if (nmem.mem() == MAP_FAILED) goto fail;
  memcpy(nmem.mem(), r.from, size);
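
  // In the platform-specific branches below, the mapping at `start` is
  // replaced (or overwritten) before the backed-up code is copied back, so
  // there is a window during which the region's contents are not runnable.
  // Nothing executed during that window may live inside the moved region,
  // which is why this function sits in `lpstub` and must not call anything
  // that might itself be moved (see the comment at the top of this file).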
#if defined(__linux__)
  // We already know the original page is r-xp
  // (PROT_READ, PROT_EXEC, MAP_PRIVATE).
  // We want PROT_WRITE because we are writing into it.
  // We want it at the fixed address, so we use MAP_FIXED.
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED);
  if (tmem.mem() == MAP_FAILED) goto fail;
  if (madvise(tmem.mem(), size, 14 /* MADV_HUGEPAGE */) == -1) goto fail;
  memcpy(start, nmem.mem(), size);
#elif defined(__FreeBSD__)
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED |
             MAP_ALIGNED_SUPER);
  if (tmem.mem() == MAP_FAILED) goto fail;
  memcpy(start, nmem.mem(), size);
#elif defined(__APPLE__)
  // There is not enough room to reserve the mapping close to the region
  // address, so we settle for passing a hint without forcing the new
  // address to be close to it.
  // We explicitly request all permissions since we plan to write into it.
  tmem.Reset(start, size,
             PROT_READ | PROT_WRITE | PROT_EXEC,
             MAP_PRIVATE | MAP_ANONYMOUS,
             VM_FLAGS_SUPERPAGE_SIZE_2MB);
  if (tmem.mem() == MAP_FAILED) goto fail;
  memcpy(tmem.mem(), nmem.mem(), size);
  if (mprotect(start, size, PROT_READ | PROT_WRITE | PROT_EXEC) == -1)
    goto fail;
  memcpy(start, tmem.mem(), size);
#endif

  if (mprotect(start, size, PROT_READ | PROT_EXEC) == -1) goto fail;
  MemoryMapPointer::SetName(start, size, "nodejs Large Page");

  // Do not `munmap(tmem, size)` on success; Reset() detaches the mapping so
  // that the destructor leaves it in place.
  tmem.Reset();
  return 0;
fail:
  PrintSystemError(errno);
  return -1;
}
#endif  // defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES

// This is the primary API called from main.
int MapStaticCodeToLargePages() {
#if defined(NODE_ENABLE_LARGE_CODE_PAGES) && NODE_ENABLE_LARGE_CODE_PAGES
  bool have_thp = false;
#if defined(__linux__)
  have_thp = IsTransparentHugePagesEnabled();
#elif defined(__FreeBSD__)
  have_thp = IsSuperPagesEnabled();
#elif defined(__APPLE__)
  // The pse-36 CPU flag (superpage support) is present in recent x64 Macs.
  have_thp = true;
#endif
  if (!have_thp)
    return EACCES;

  struct text_region r = FindNodeTextRegion();
  if (!r.found_text_region)
    return ENOENT;

  return MoveTextRegionToLargePages(r);
#else
  return ENOTSUP;
#endif
}

const char* LargePagesError(int status) {
  switch (status) {
    case ENOTSUP:
      return "Mapping to large pages is not supported.";

    case EACCES:
      return "Large pages are not enabled.";

    case ENOENT:
      return "Failed to find the text region.";

    case -1:
      return "Mapping code to large pages failed. Reverting to default page "
             "size.";

    case 0:
      return "OK";

    default:
      return "Unknown error";
  }
}

}  // namespace node
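
// A typical caller (a sketch, not necessarily the exact Node.js call site)
// checks the returned status and reports failures without aborting startup:
//
//   int status = node::MapStaticCodeToLargePages();
//   if (status != 0)
//     fprintf(stderr, "%s\n", node::LargePagesError(status));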