--- old/src/os/windows/vm/os_windows.cpp 2011-08-23 12:19:12.829786000 -0500 +++ new/src/os/windows/vm/os_windows.cpp 2011-08-23 12:19:12.459762000 -0500 @@ -1964,6 +1964,7 @@ } } + int os::signal_lookup() { return check_pending_signals(false); } @@ -2614,6 +2615,49 @@ static HANDLE _hProcess; static HANDLE _hToken; +// Container for NUMA node list info +class NUMANodeListHolder { +private: + int *_numa_used_node_list; // allocated below + int _numa_used_node_count; +public: + NUMANodeListHolder() { + _numa_used_node_count = 0; + _numa_used_node_list = NULL; + // do rest of initialization in build routine (after function pointers are set up) + } + + ~NUMANodeListHolder() { + FREE_C_HEAP_ARRAY(int, _numa_used_node_list); + } + + bool build() { + DWORD_PTR proc_aff_mask; + DWORD_PTR sys_aff_mask; + if (!GetProcessAffinityMask(GetCurrentProcess(), &proc_aff_mask, &sys_aff_mask)) return false; + ULONG highest_node_number; + if (!os::Kernel32Dll::GetNumaHighestNodeNumber(&highest_node_number)) return false; + _numa_used_node_list = NEW_C_HEAP_ARRAY(int, highest_node_number); + for (unsigned int i = 0; i <= highest_node_number; i++) { + ULONGLONG proc_mask_numa_node; + if (!os::Kernel32Dll::GetNumaNodeProcessorMask(i, &proc_mask_numa_node)) return false; + if ((proc_aff_mask & proc_mask_numa_node)!=0) { + _numa_used_node_list[_numa_used_node_count++] = i; + } + } + return (_numa_used_node_count > 1); + } + + int get_count() {return _numa_used_node_count;} + int get_node_list_entry(int n) { + // for indexes out of range, returns -1 + return (n < _numa_used_node_count ? _numa_used_node_list[n] : -1); + } + +} numa_node_list_holder; + + + static size_t _large_page_size = 0; static bool resolve_functions_for_large_page_init() { @@ -2653,6 +2697,154 @@ _hToken = NULL; } +static bool numa_interleaving_init() { + bool success = false; + bool use_numa_specified = !FLAG_IS_DEFAULT(UseNUMA); + bool use_numa_interleaving_specified = !FLAG_IS_DEFAULT(UseNUMAInterleaving); + + // print a warning if UseNUMA or UseNUMAInterleaving flag is specified on command line + bool warn_on_failure = use_numa_specified || use_numa_interleaving_specified; +# define WARN(msg) if (warn_on_failure) { warning(msg); } + + // NUMAInterleaveGranularity cannot be less than vm_allocation_granularity (or _large_page_size if using large pages) + size_t min_interleave_granularity = UseLargePages ? _large_page_size : os::vm_allocation_granularity(); + NUMAInterleaveGranularity = align_size_up(NUMAInterleaveGranularity, min_interleave_granularity); + + if (os::Kernel32Dll::NumaCallsAvailable()) { + if (numa_node_list_holder.build()) { + if (PrintMiscellaneous && Verbose) { + tty->print("NUMA UsedNodeCount=%d, namely ", os::numa_get_groups_num()); + for (int i = 0; i < os::numa_get_groups_num(); i++) { + tty->print("%d ", numa_node_list_holder.get_node_list_entry(i)); + } + tty->print("\n"); + } + success = true; + } else { + WARN("Process does not cover multiple NUMA nodes."); + } + } else { + WARN("NUMA Interleaving is not supported by the operating system."); + } + if (!success) { + if (use_numa_specified) WARN("...Ignoring UseNUMA flag."); + if (use_numa_interleaving_specified) WARN("...Ignoring UseNUMAInterleaving flag."); + } + return success; +#undef WARN +} + +// this routine is used whenever we need to reserve a contiguous VA range +// but we need to make separate VirtualAlloc calls for each piece of the range +// Reasons for doing this: +// * UseLargePagesIndividualAllocation was set (normally only needed on WS2003 but possible to be set otherwise) +// * UseNUMAInterleaving requires a separate node for each piece +static char* allocate_pages_individually(size_t bytes, char* addr, DWORD flags, DWORD prot, + bool should_inject_error=false) { + char * p_buf; + // note: at setup time we guaranteed that NUMAInterleaveGranularity was aligned up to a page size + size_t page_size = UseLargePages ? _large_page_size : os::vm_allocation_granularity(); + size_t chunk_size = UseNUMAInterleaving ? NUMAInterleaveGranularity : page_size; + + // first reserve enough address space in advance since we want to be + // able to break a single contiguous virtual address range into multiple + // large page commits but WS2003 does not allow reserving large page space + // so we just use 4K pages for reserve, this gives us a legal contiguous + // address space. then we will deallocate that reservation, and re alloc + // using large pages + const size_t size_of_reserve = bytes + chunk_size; + if (bytes > size_of_reserve) { + // Overflowed. + return NULL; + } + p_buf = (char *) VirtualAlloc(addr, + size_of_reserve, // size of Reserve + MEM_RESERVE, + PAGE_READWRITE); + // If reservation failed, return NULL + if (p_buf == NULL) return NULL; + + os::release_memory(p_buf, bytes + chunk_size); + + // we still need to round up to a page boundary (in case we are using large pages) + // but not to a chunk boundary (in case InterleavingGranularity doesn't align with page size) + // instead we handle this in the bytes_to_rq computation below + p_buf = (char *) align_size_up((size_t)p_buf, page_size); + + // now go through and allocate one chunk at a time until all bytes are + // allocated + size_t bytes_remaining = bytes; + // An overflow of align_size_up() would have been caught above + // in the calculation of size_of_reserve. + char * next_alloc_addr = p_buf; + HANDLE hProc = GetCurrentProcess(); + +#ifdef ASSERT + // Variable for the failure injection + long ran_num = os::random(); + size_t fail_after = ran_num % bytes; +#endif + + int count=0; + while (bytes_remaining) { + // select bytes_to_rq to get to the next chunk_size boundary + + size_t bytes_to_rq = MIN2(bytes_remaining, chunk_size - ((size_t)next_alloc_addr % chunk_size)); + // Note allocate and commit + char * p_new; + +#ifdef ASSERT + bool inject_error_now = should_inject_error && (bytes_remaining <= fail_after); +#else + const bool inject_error_now = false; +#endif + + if (inject_error_now) { + p_new = NULL; + } else { + if (!UseNUMAInterleaving) { + p_new = (char *) VirtualAlloc(next_alloc_addr, + bytes_to_rq, + flags, + prot); + } else { + // get the next node to use from the used_node_list + DWORD node = numa_node_list_holder.get_node_list_entry(count % os::numa_get_groups_num()); + p_new = (char *)os::Kernel32Dll::VirtualAllocExNuma(hProc, + next_alloc_addr, + bytes_to_rq, + flags, + prot, + node); + } + } + + if (p_new == NULL) { + // Free any allocated pages + if (next_alloc_addr > p_buf) { + // Some memory was committed so release it. + size_t bytes_to_release = bytes - bytes_remaining; + os::release_memory(p_buf, bytes_to_release); + } +#ifdef ASSERT + if (should_inject_error) { + if (TracePageSizes && Verbose) { + tty->print_cr("Reserving pages individually failed."); + } + } +#endif + return NULL; + } + bytes_remaining -= bytes_to_rq; + next_alloc_addr += bytes_to_rq; + count++; + } + // made it this far, success + return p_buf; +} + + + void os::large_page_init() { if (!UseLargePages) return; @@ -2722,9 +2914,27 @@ assert((size_t)addr % os::vm_allocation_granularity() == 0, "reserve alignment"); assert(bytes % os::vm_allocation_granularity() == 0, "reserve block size"); - char* res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE); + char* res; + // note that if UseLargePages is on, all the areas that require interleaving + // will go thru reserve_memory_special rather than thru here. + bool use_individual = (UseNUMAInterleaving && !UseLargePages); + if (!use_individual) { + res = (char*)VirtualAlloc(addr, bytes, MEM_RESERVE, PAGE_READWRITE); + } else { + elapsedTimer reserveTimer; + if( Verbose && PrintMiscellaneous ) reserveTimer.start(); + // in numa interleaving, we have to allocate pages individually + // (well really chunks of allocation granularity size) + res = allocate_pages_individually(bytes, addr, MEM_RESERVE, PAGE_READWRITE); + if( Verbose && PrintMiscellaneous ) { + reserveTimer.stop(); + tty->print_cr("reserve_memory of %Ix bytes took %ld ms (%ld ticks)", bytes, + reserveTimer.milliseconds(), reserveTimer.ticks()); + } + } assert(res == NULL || addr == NULL || addr == res, "Unexpected address from reserve."); + return res; } @@ -2754,92 +2964,21 @@ char* os::reserve_memory_special(size_t bytes, char* addr, bool exec) { const DWORD prot = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE; + const DWORD flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES; - if (UseLargePagesIndividualAllocation) { + // with large pages, there are two cases where we need to use Individual Allocation + // 1) the UseLargePagesIndividualAllocation flag is set (set by default on WS2003) + // 2) NUMA Interleaving is enabled, in which case we use a different node for each page + if (UseLargePagesIndividualAllocation || UseNUMAInterleaving) { if (TracePageSizes && Verbose) { tty->print_cr("Reserving large pages individually."); } - char * p_buf; - // first reserve enough address space in advance since we want to be - // able to break a single contiguous virtual address range into multiple - // large page commits but WS2003 does not allow reserving large page space - // so we just use 4K pages for reserve, this gives us a legal contiguous - // address space. then we will deallocate that reservation, and re alloc - // using large pages - const size_t size_of_reserve = bytes + _large_page_size; - if (bytes > size_of_reserve) { - // Overflowed. - warning("Individually allocated large pages failed, " - "use -XX:-UseLargePagesIndividualAllocation to turn off"); + char * p_buf = allocate_pages_individually(bytes, addr, flags, prot, LargePagesIndividualAllocationInjectError); + if (p_buf == NULL) { + warning("Individually allocated pages failed, " + "use -XX:-UseLargePagesIndividualAllocation to turn off"); return NULL; } - p_buf = (char *) VirtualAlloc(addr, - size_of_reserve, // size of Reserve - MEM_RESERVE, - PAGE_READWRITE); - // If reservation failed, return NULL - if (p_buf == NULL) return NULL; - - release_memory(p_buf, bytes + _large_page_size); - // round up to page boundary. If the size_of_reserve did not - // overflow and the reservation did not fail, this align up - // should not overflow. - p_buf = (char *) align_size_up((size_t)p_buf, _large_page_size); - - // now go through and allocate one page at a time until all bytes are - // allocated - size_t bytes_remaining = align_size_up(bytes, _large_page_size); - // An overflow of align_size_up() would have been caught above - // in the calculation of size_of_reserve. - char * next_alloc_addr = p_buf; - -#ifdef ASSERT - // Variable for the failure injection - long ran_num = os::random(); - size_t fail_after = ran_num % bytes; -#endif - - while (bytes_remaining) { - size_t bytes_to_rq = MIN2(bytes_remaining, _large_page_size); - // Note allocate and commit - char * p_new; - -#ifdef ASSERT - bool inject_error = LargePagesIndividualAllocationInjectError && - (bytes_remaining <= fail_after); -#else - const bool inject_error = false; -#endif - - if (inject_error) { - p_new = NULL; - } else { - p_new = (char *) VirtualAlloc(next_alloc_addr, - bytes_to_rq, - MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, - prot); - } - - if (p_new == NULL) { - // Free any allocated pages - if (next_alloc_addr > p_buf) { - // Some memory was committed so release it. - size_t bytes_to_release = bytes - bytes_remaining; - release_memory(p_buf, bytes_to_release); - } -#ifdef ASSERT - if (UseLargePagesIndividualAllocation && - LargePagesIndividualAllocationInjectError) { - if (TracePageSizes && Verbose) { - tty->print_cr("Reserving large pages individually failed."); - } - } -#endif - return NULL; - } - bytes_remaining -= bytes_to_rq; - next_alloc_addr += bytes_to_rq; - } return p_buf; @@ -2867,14 +3006,40 @@ assert(bytes % os::vm_page_size() == 0, "commit in page-sized chunks"); // Don't attempt to print anything if the OS call fails. We're // probably low on resources, so the print itself may cause crashes. - bool result = VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) != 0; - if (result != NULL && exec) { - DWORD oldprot; - // Windows doc says to use VirtualProtect to get execute permissions - return VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot) != 0; + + // unless we have NUMAInterleavaing enabled, the range of a commit + // is always within a reserve covered by a single VirtualAlloc + // in that case we can just do a single commit for the requested size + if (!UseNUMAInterleaving) { + if (VirtualAlloc(addr, bytes, MEM_COMMIT, PAGE_READWRITE) == NULL) return false; + if (exec) { + DWORD oldprot; + // Windows doc says to use VirtualProtect to get execute permissions + if (!VirtualProtect(addr, bytes, PAGE_EXECUTE_READWRITE, &oldprot)) return false; + } + return true; } else { - return result; + // when NUMAInterleaving is enabled, the commit might cover a range that + // came from multiple VirtualAlloc reserves using allocate_pages_individually + // VirtualQuery can help us determine that, the RegionSize that it returns + // represents the number of bytes thNUMANodeListHolderat can be committed in one step. + size_t bytes_remaining = bytes; + char * next_alloc_addr = addr; + while (bytes_remaining > 0) { + MEMORY_BASIC_INFORMATION alloc_info; + VirtualQuery(next_alloc_addr, &alloc_info, sizeof(alloc_info)); + size_t bytes_to_rq = MIN2(bytes_remaining, alloc_info.RegionSize); + if (VirtualAlloc(next_alloc_addr, bytes_to_rq, MEM_COMMIT, PAGE_READWRITE) == NULL) return false; + if (exec) { + DWORD oldprot; + if (!VirtualProtect(next_alloc_addr, bytes_to_rq, PAGE_EXECUTE_READWRITE, &oldprot)) return false; + } + bytes_remaining -= bytes_to_rq; + next_alloc_addr += bytes_to_rq; + } } + // if we made it this far, return true + return true; } bool os::commit_memory(char* addr, size_t size, size_t alignment_hint, @@ -2948,14 +3113,13 @@ void os::numa_make_global(char *addr, size_t bytes) { } void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint) { } bool os::numa_topology_changed() { return false; } -size_t os::numa_get_groups_num() { return 1; } +size_t os::numa_get_groups_num() { return numa_node_list_holder.get_count(); } int os::numa_get_group_id() { return 0; } size_t os::numa_get_leaf_groups(int *ids, size_t size) { - if (size > 0) { - ids[0] = 0; - return 1; + for (int i = 0; i < size; i++) { + ids[i] = numa_node_list_holder.get_node_list_entry(i); } - return 0; + return size; } bool os::get_page_info(char *start, page_info* info) { @@ -3480,7 +3644,7 @@ if(Verbose && PrintMiscellaneous) tty->print("[Memory Serialize Page address: " INTPTR_FORMAT "]\n", (intptr_t)mem_serialize_page); #endif -} + } os::large_page_init(); @@ -3583,8 +3747,10 @@ // initialize thread priority policy prio_init(); - if (UseNUMA && !ForceNUMA) { - UseNUMA = false; // Currently unsupported. + if (UseNUMAInterleaving) { + // first check whether this Windows OS supports VirtualAllocExNuma, if not ignore this flag + bool success = numa_interleaving_init(); + if (!success) UseNUMAInterleaving = false; } return JNI_OK; @@ -4758,7 +4924,14 @@ // Kernel32 API typedef SIZE_T (WINAPI* GetLargePageMinimum_Fn)(void); +typedef LPVOID (WINAPI *VirtualAllocExNuma_Fn) (HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD); +typedef BOOL (WINAPI *GetNumaHighestNodeNumber_Fn) (PULONG); +typedef BOOL (WINAPI *GetNumaNodeProcessorMask_Fn) (UCHAR, PULONGLONG); + GetLargePageMinimum_Fn os::Kernel32Dll::_GetLargePageMinimum = NULL; +VirtualAllocExNuma_Fn os::Kernel32Dll::_VirtualAllocExNuma = NULL; +GetNumaHighestNodeNumber_Fn os::Kernel32Dll::_GetNumaHighestNodeNumber = NULL; +GetNumaNodeProcessorMask_Fn os::Kernel32Dll::_GetNumaNodeProcessorMask = NULL; BOOL os::Kernel32Dll::initialized = FALSE; SIZE_T os::Kernel32Dll::GetLargePageMinimum() { assert(initialized && _GetLargePageMinimum != NULL, @@ -4773,19 +4946,56 @@ return _GetLargePageMinimum != NULL; } +BOOL os::Kernel32Dll::NumaCallsAvailable() { + if (!initialized) { + initialize(); + } + return _VirtualAllocExNuma != NULL; +} + +LPVOID os::Kernel32Dll::VirtualAllocExNuma(HANDLE hProc, LPVOID addr, SIZE_T bytes, DWORD flags, DWORD prot, DWORD node) { + assert(initialized && _VirtualAllocExNuma != NULL, + "NUMACallsAvailable() not yet called"); -#ifndef JDK6_OR_EARLIER + return _VirtualAllocExNuma(hProc, addr, bytes, flags, prot, node); +} -void os::Kernel32Dll::initialize() { +BOOL os::Kernel32Dll::GetNumaHighestNodeNumber(PULONG ptr_highest_node_number) { + assert(initialized && _GetNumaHighestNodeNumber != NULL, + "NUMACallsAvailable() not yet called"); + + return _GetNumaHighestNodeNumber(ptr_highest_node_number); +} + +BOOL os::Kernel32Dll::GetNumaNodeProcessorMask(UCHAR node, PULONGLONG proc_mask) { + assert(initialized && _GetNumaNodeProcessorMask != NULL, + "NUMACallsAvailable() not yet called"); + + return _GetNumaNodeProcessorMask(node, proc_mask); +} + + +void os::Kernel32Dll::initializeCommon() { if (!initialized) { HMODULE handle = ::GetModuleHandle("Kernel32.dll"); assert(handle != NULL, "Just check"); _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum"); + _VirtualAllocExNuma = (VirtualAllocExNuma_Fn)::GetProcAddress(handle, "VirtualAllocExNuma"); + _GetNumaHighestNodeNumber = (GetNumaHighestNodeNumber_Fn)::GetProcAddress(handle, "GetNumaHighestNodeNumber"); + _GetNumaNodeProcessorMask = (GetNumaNodeProcessorMask_Fn)::GetProcAddress(handle, "GetNumaNodeProcessorMask"); initialized = TRUE; } } + +#ifndef JDK6_OR_EARLIER + +void os::Kernel32Dll::initialize() { + initializeCommon(); +} + + // Kernel32 API inline BOOL os::Kernel32Dll::SwitchToThread() { return ::SwitchToThread(); @@ -4887,18 +5097,19 @@ Module32Next_Fn os::Kernel32Dll::_Module32Next = NULL; GetNativeSystemInfo_Fn os::Kernel32Dll::_GetNativeSystemInfo = NULL; + void os::Kernel32Dll::initialize() { if (!initialized) { HMODULE handle = ::GetModuleHandle("Kernel32.dll"); assert(handle != NULL, "Just check"); _SwitchToThread = (SwitchToThread_Fn)::GetProcAddress(handle, "SwitchToThread"); - _GetLargePageMinimum = (GetLargePageMinimum_Fn)::GetProcAddress(handle, "GetLargePageMinimum"); _CreateToolhelp32Snapshot = (CreateToolhelp32Snapshot_Fn) ::GetProcAddress(handle, "CreateToolhelp32Snapshot"); _Module32First = (Module32First_Fn)::GetProcAddress(handle, "Module32First"); _Module32Next = (Module32Next_Fn)::GetProcAddress(handle, "Module32Next"); _GetNativeSystemInfo = (GetNativeSystemInfo_Fn)::GetProcAddress(handle, "GetNativeSystemInfo"); + initializeCommon(); // resolve the functions that always need resolving initialized = TRUE; } @@ -4964,6 +5175,8 @@ _GetNativeSystemInfo(lpSystemInfo); } + + // PSAPI API