--- old/src/os/posix/vm/vmError_posix.cpp 2017-02-06 12:02:30.388371000 +0100 +++ new/src/os/posix/vm/vmError_posix.cpp 2017-02-06 12:02:29.880332000 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it --- old/src/os/windows/vm/vmError_windows.cpp 2017-02-06 12:02:31.270406000 +0100 +++ new/src/os/windows/vm/vmError_windows.cpp 2017-02-06 12:02:31.071408000 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -67,7 +67,7 @@ } } -// Error reporting canceleation: there is no easy way to implement this on Windows, because we do +// Error reporting cancelleation: there is no easy way to implement this on Windows, because we do // not have an easy way to send signals to threads (aka to cause a win32 Exception in another // thread). We would need something like "RaiseException(HANDLE thread)"... void VMError::reporting_started() {} --- old/src/share/vm/runtime/thread.cpp 2017-02-06 12:02:32.164458000 +0100 +++ new/src/share/vm/runtime/thread.cpp 2017-02-06 12:02:31.949438000 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it --- old/src/share/vm/utilities/vmError.cpp 2017-02-06 12:02:33.436519000 +0100 +++ new/src/share/vm/utilities/vmError.cpp 2017-02-06 12:02:32.924489000 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -313,36 +313,41 @@ int VMError::_current_step; const char* VMError::_current_step_info; -// Timeout handling. -static volatile jlong reporting_start_time = -1; -static volatile bool reporting_timeout = false; -static volatile jlong step_start_time = -1; -static volatile bool step_timeout = false; - -// Helper function, sets a timestamp to the current time. -static void set_to_now(volatile jlong* p) { - const jlong now = os::javaTimeMillis(); - if (VM_Version::supports_cx8()) { - Atomic::store(now, p); - } else { - *p = now; - } +volatile jlong VMError::_reporting_start_time = -1; +volatile bool VMError::_reporting_did_timeout = false; +volatile jlong VMError::_step_start_time = -1; +volatile bool VMError::_step_did_timeout = false; + +// Helper, return current timestamp for timeout handling. +jlong VMError::get_current_timestamp() { + return os::javaTimeNanos(); } +// Factor to translate the timestamp to seconds. +#define TIMESTAMP_TO_SECONDS_FACTOR (1000 * 1000 * 1000) -// helper function, returns value of time stamp. -static jlong get_timestamp(volatile jlong* p) { - if (VM_Version::supports_cx8()) { - return Atomic::load(p); - } else { - return *p; - } +void VMError::record_reporting_start_time() { + const jlong now = get_current_timestamp(); + Atomic::store(now, &_reporting_start_time); +} + +jlong VMError::get_reporting_start_time() { + return Atomic::load(&_reporting_start_time); +} + +void VMError::record_step_start_time() { + const jlong now = get_current_timestamp(); + Atomic::store(now, &_step_start_time); +} + +jlong VMError::get_step_start_time() { + return Atomic::load(&_step_start_time); } void VMError::report(outputStream* st, bool _verbose) { # define BEGIN if (_current_step == 0) { _current_step = __LINE__; # define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s; \ - set_to_now(&step_start_time); step_timeout = false; + record_step_start_time(); _step_did_timeout = false; # define END } // don't allocate large buffer on stack @@ -379,16 +384,17 @@ controlled_crash(TestCrashInErrorHandler); } - // TestUnresponsiveErrorHandler: three times to trigger first a step timeout, then - // the global error reporting timeout. - STEP("test unresponsive error reporting step 1") - if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); } - - STEP("test unresponsive error reporting step 2") - if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); } - - STEP("test unresponsive error reporting step 3") + // TestUnresponsiveErrorHandler: We want to test both step timeouts and global timeout. + // Step to global timeout ratio is 4:1, so in order to be absolutely sure we hit the + // global timeout, lets execute the timeout step five times. + // See corresponding test in test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java + #define TIMEOUT_TEST_STEP STEP("test unresponsive error reporting step") \ if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); } + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP + TIMEOUT_TEST_STEP STEP("test safefetch in error handler") // test whether it is safe to use SafeFetch32 in Crash Handler. Test twice @@ -1215,7 +1221,7 @@ set_error_reported(); reporting_started(); - set_to_now(&reporting_start_time); + record_reporting_start_time(); if (ShowMessageBoxOnError || PauseAtExit) { show_message_box(buffer, sizeof(buffer)); @@ -1261,16 +1267,17 @@ st->cr(); // Timeout handling. - if (step_timeout) { + if (_step_did_timeout) { // The current step had a timeout. Lets continue reporting with the next step. st->print_raw("[timeout occurred during error reporting in step \""); st->print_raw(_current_step_info); - st->print_cr("\"] after " INT64_FORMAT " ms.", os::javaTimeMillis() - step_start_time); - } else if (reporting_timeout) { - // We hit ErrorLogTimeout. Reporting will stop altogether. Lets wrap things + st->print_cr("\"] after " INT64_FORMAT " s.", + (get_current_timestamp() - _step_start_time) / TIMESTAMP_TO_SECONDS_FACTOR); + } else if (_reporting_did_timeout) { + // We hit ErrorLogTimeout. Reporting will stop altogether. Let's wrap things // up, the process is about to be stopped by the WatcherThread. - st->print_cr("------ Timout during error reporting after " INT64_FORMAT "ms. ------", - os::javaTimeMillis() - reporting_start_time); + st->print_cr("------ Timeout during error reporting after " INT64_FORMAT " s. ------", + (get_current_timestamp() - _reporting_start_time) / TIMESTAMP_TO_SECONDS_FACTOR); st->flush(); // Watcherthread is about to call os::die. Lets just wait. os::infinite_sleep(); @@ -1493,26 +1500,27 @@ return false; } - const jlong reporting_start_time_l = get_timestamp(&reporting_start_time); - const jlong now = os::javaTimeMillis(); + const jlong reporting_start_time_l = get_reporting_start_time(); + const jlong now = get_current_timestamp(); + // Timestamp is stored in nanos. if (reporting_start_time_l > 0) { - const jlong end = reporting_start_time_l + ErrorLogTimeout * 1000; + const jlong end = reporting_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR; if (end <= now) { - reporting_timeout = true; - VMError::interrupt_reporting_thread(); + _reporting_did_timeout = true; + interrupt_reporting_thread(); return true; // global timeout } } - const jlong step_start_time_l = get_timestamp(&step_start_time); + const jlong step_start_time_l = get_step_start_time(); if (step_start_time_l > 0) { - // A step times out after half of the total timeout. Steps are mostly fast unless they - // hang for some reason, so this simple rule allows for one hanging step and still - // leaves time enough for the rest of the steps to finish. - const jlong end = step_start_time_l + ErrorLogTimeout * 1000 / 2; + // A step times out after a quarter of the total timeout. Steps are mostly fast unless they + // hang for some reason, so this simple rule allows for three hanging step and still + // hopefully leaves time enough for the rest of the steps to finish. + const jlong end = step_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4; if (end <= now) { - step_timeout = true; - VMError::interrupt_reporting_thread(); + _step_did_timeout = true; + interrupt_reporting_thread(); return false; // (Not a global timeout) } } --- old/src/share/vm/utilities/vmError.hpp 2017-02-06 12:02:34.274558000 +0100 +++ new/src/share/vm/utilities/vmError.hpp 2017-02-06 12:02:34.072552000 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -74,6 +74,16 @@ // no core/minidump has been written to disk static char coredump_message[O_BUFLEN]; + // Timeout handling: + // Timestamp at which error reporting started; -1 if no error reporting in progress. + static volatile jlong _reporting_start_time; + // Whether or not error reporting did timeout. + static volatile bool _reporting_did_timeout; + // Timestamp at which the last error reporting step started; -1 if no error reporting + // in progress. + static volatile jlong _step_start_time; + // Whether or not the last error reporting step did timeout. + static volatile bool _step_did_timeout; // set signal handlers on Solaris/Linux or the default exception filter // on Windows, to handle recursive crashes. @@ -104,10 +114,20 @@ static fdStream out; static fdStream log; // error log used by VMError::report_and_die() - // Platform dependend helper functions used for timeout handling. + // Timeout handling. + // Hook functions for platform dependend functionality: static void reporting_started(); static void interrupt_reporting_thread(); + // Helper function to get the current timestamp. + static jlong get_current_timestamp(); + + // Accessors to get/set the start times for step and total timeout. + static void record_reporting_start_time(); + static jlong get_reporting_start_time(); + static void record_step_start_time(); + static jlong get_step_start_time(); + public: // return a string to describe the error @@ -152,8 +172,8 @@ static intptr_t get_first_error_tid() { return first_error_tid; } - // Called by WatcherThread to check if the currently running error reporting did timeout. - // Returns true if reporting did hit the global ErrorLogTimeout. + // Called by the WatcherThread to check if error reporting has timed-out. + // Returns true if error reporting has not completed within the ErrorLogTimeout limit. static bool check_timeout(); }; --- old/test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java 2017-02-06 12:02:35.193606000 +0100 +++ new/test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java 2017-02-06 12:02:34.980595000 +0100 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -37,7 +37,7 @@ * @summary Hanging Error Reporting steps may lead to torn error logs * @modules java.base/jdk.internal.misc * @library /test/lib - * @requires vm.debug == true + * @requires (vm.debug == true) & (os.family != "windows") * @author Thomas Stuefe (SAP) */ @@ -47,31 +47,32 @@ public static void main(String[] args) throws Exception { /* Start the VM and let it crash. Specify TestUnresponsiveErrorHandler which will - * let three subsequent error reporting steps hang. The Timeout handling triggered - * by the WatcherThread should kick in and interrupt those steps. + * let five subsequent error reporting steps hang. The Timeout handling triggered + * by the WatcherThread should kick in and interrupt those steps. In theory, the + * text "timeout occurred during error reporting in step .." (the little timeouts) + * should occur in the error log up to four times, followed by the final big timeout + * "------ Timeout during error reporting after xx s. ------" * - * Timeout in Error Reporting is controlled by ErrorLogTimeout, which defines after - * how many seconds error reporting is stopped altogether and the VM is aborted. - * Each error reporting step is granted up to half of this time before it is canceled - * and the next step is started (the logic being, typically error steps are - * instantaneous, if they hang, its typically an error which only should affect one - * or two steps). - * So, by causing three subsequent steps to hang, we expect the error log to contain - * at least one, possibly two step-timeout messages, and then the global ErrorLogTimeout - * should have been reached and the VM should have been aborted by the WatcherThread, - * which should write "timer expired, abort..." to stderr. */ - - // Not debug, not windows (yet) - if (!Platform.isDebugBuild() || Platform.isWindows()) { - return; - } + * Note that there are a number of uncertainties which make writing a 100% foolproof + * test challenging. The time the error reporting thread takes to react to the + * timeout triggers is unknown. So it is difficult to predict how many little timeouts + * will be visible before the big timeout kicks in. Also, once the big timeout hits, + * error reporting thread and Watcherthread will race. The former writes his last + * message to the error logs and flushes, the latter waits 200ms and then exits the + * process without further synchronization with the error reporting thread. + * + * Because of all this and the desire to write a bullet proof test which does + * not fail sporadically, we will not test for the final timeout message nor for all + * of the optimally expected little timeout messages. We just test for two of the + * little timeout messages to see that repeated timeout handling is basically working. + */ ProcessBuilder pb = ProcessTools.createJavaProcessBuilder( "-XX:+UnlockDiagnosticVMOptions", "-Xmx100M", "-XX:ErrorHandlerTest=14", "-XX:+TestUnresponsiveErrorHandler", - "-XX:ErrorLogTimeout=10", // 10 seconds + "-XX:ErrorLogTimeout=16", // 16 seconds big timeout = 4 seconds per little timeout "-XX:-CreateCoredumpOnCrash", "-version"); @@ -102,7 +103,10 @@ BufferedReader br = new BufferedReader(new InputStreamReader(fis)); String line = null; + + Pattern [] pattern = new Pattern[] { + Pattern.compile(".*timeout occurred during error reporting in step.*"), Pattern.compile(".*timeout occurred during error reporting in step.*") }; int currentPattern = 0;