< prev index next >

src/share/vm/utilities/vmError.cpp

Print this page
rev 12487 : 8166944: Hanging Error Reporting steps may lead to torn error logs.
Reviewed-by: cjplummer, dholmes
Summary: Interupt error reporting if reporting steps hang to enable subsequent reporting steps to run.
rev 12488 : [mq]: 8166944-Hanging-Error-Reporting-2
   1 /*
   2  * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *


 296 // information that may be unsafe to get after a fatal error. If it happens,
 297 // you may find nested report_and_die() frames when you look at the stack
 298 // in a debugger.
 299 //
 300 // In general, a hang in error handler is much worse than a crash or internal
 301 // error, as it's harder to recover from a hang. Deadlock can happen if we
 302 // try to grab a lock that is already owned by current thread, or if the
 303 // owner is blocked forever (e.g. in os::infinite_sleep()). If possible, the
 304 // error handler and all the functions it called should avoid grabbing any
 305 // lock. An important thing to notice is that memory allocation needs a lock.
 306 //
 307 // We should avoid using large stack allocated buffers. Many errors happen
 308 // when stack space is already low. Making things even worse is that there
 309 // could be nested report_and_die() calls on stack (see above). Only one
 310 // thread can report error, so large buffers are statically allocated in data
 311 // segment.
 312 
 313 int          VMError::_current_step;
 314 const char*  VMError::_current_step_info;
 315 
 316 // Timeout handling.
 317 static volatile jlong reporting_start_time = -1;
 318 static volatile bool reporting_timeout = false;
 319 static volatile jlong step_start_time = -1;
 320 static volatile bool step_timeout = false;
 321 
 322 // Helper function, sets a timestamp to the current time.
 323 static void set_to_now(volatile jlong* p) {
 324   const jlong now = os::javaTimeMillis();
 325   if (VM_Version::supports_cx8()) {
 326     Atomic::store(now, p);
 327   } else {
 328     *p = now;
 329   }
 330 }


 331 
 332 // helper function, returns value of time stamp.
 333 static jlong get_timestamp(volatile jlong* p) {
 334   if (VM_Version::supports_cx8()) {
 335     return Atomic::load(p);
 336   } else {
 337     return *p;
 338   }









 339 }
 340 
 341 void VMError::report(outputStream* st, bool _verbose) {
 342 
 343 # define BEGIN if (_current_step == 0) { _current_step = __LINE__;
 344 # define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s; \
 345   set_to_now(&step_start_time); step_timeout = false;
 346 # define END }
 347 
 348   // don't allocate large buffer on stack
 349   static char buf[O_BUFLEN];
 350 
 351   BEGIN
 352 
 353   STEP("printing fatal error message")
 354 
 355     st->print_cr("#");
 356     if (should_report_bug(_id)) {
 357       st->print_cr("# A fatal error has been detected by the Java Runtime Environment:");
 358     } else {
 359       st->print_cr("# There is insufficient memory for the Java "
 360                    "Runtime Environment to continue.");
 361     }
 362 
 363 #ifndef PRODUCT
 364   // Error handler self tests
 365 
 366   // test secondary error handling. Test it twice, to test that resetting
 367   // error handler after a secondary crash works.
 368   STEP("test secondary crash 1")
 369     if (_verbose && TestCrashInErrorHandler != 0) {
 370       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 371         TestCrashInErrorHandler);
 372       controlled_crash(TestCrashInErrorHandler);
 373     }
 374 
 375   STEP("test secondary crash 2")
 376     if (_verbose && TestCrashInErrorHandler != 0) {
 377       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 378         TestCrashInErrorHandler);
 379       controlled_crash(TestCrashInErrorHandler);
 380     }
 381 
 382   // TestUnresponsiveErrorHandler: three times to trigger first a step timeout, then
 383   // the global error reporting timeout.
 384   STEP("test unresponsive error reporting step 1")
 385     if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }
 386 
 387   STEP("test unresponsive error reporting step 2")
 388     if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }
 389 
 390   STEP("test unresponsive error reporting step 3")
 391     if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }





 392 
 393   STEP("test safefetch in error handler")
 394     // test whether it is safe to use SafeFetch32 in Crash Handler. Test twice
 395     // to test that resetting the signal handler works correctly.
 396     if (_verbose && TestSafeFetchInErrorHandler) {
 397       st->print_cr("Will test SafeFetch...");
 398       if (CanUseSafeFetch32()) {
 399         int* const invalid_pointer = (int*) get_segfault_address();
 400         const int x = 0x76543210;
 401         int i1 = SafeFetch32(invalid_pointer, x);
 402         int i2 = SafeFetch32(invalid_pointer, x);
 403         if (i1 == x && i2 == x) {
 404           st->print_cr("SafeFetch OK."); // Correctly deflected and returned default pattern
 405         } else {
 406           st->print_cr("??");
 407         }
 408       } else {
 409         st->print_cr("not possible; skipped.");
 410       }
 411     }


1198 
1199     // Initialize time stamps to use the same base.
1200     out.time_stamp().update_to(1);
1201     log.time_stamp().update_to(1);
1202 
1203     _id = id;
1204     _message = message;
1205     _thread = thread;
1206     _pc = pc;
1207     _siginfo = siginfo;
1208     _context = context;
1209     _filename = filename;
1210     _lineno = lineno;
1211     _size = size;
1212     jio_vsnprintf(_detail_msg, sizeof(_detail_msg), detail_fmt, detail_args);
1213 
1214     // first time
1215     set_error_reported();
1216 
1217     reporting_started();
1218     set_to_now(&reporting_start_time);
1219 
1220     if (ShowMessageBoxOnError || PauseAtExit) {
1221       show_message_box(buffer, sizeof(buffer));
1222 
1223       // User has asked JVM to abort. Reset ShowMessageBoxOnError so the
1224       // WatcherThread can kill JVM if the error handler hangs.
1225       ShowMessageBoxOnError = false;
1226     }
1227 
1228     os::check_dump_limit(buffer, sizeof(buffer));
1229 
1230     // reset signal handlers or exception filter; make sure recursive crashes
1231     // are handled properly.
1232     reset_signal_handlers();
1233 
1234     TRACE_VM_ERROR();
1235 
1236   } else {
1237     // If UseOsErrorReporting we call this for each level of the call stack
1238     // while searching for the exception handler.  Only the first level needs


1244     if (first_error_tid != mytid) {
1245       char msgbuf[64];
1246       jio_snprintf(msgbuf, sizeof(msgbuf),
1247                    "[thread " INTX_FORMAT " also had an error]",
1248                    mytid);
1249       out.print_raw_cr(msgbuf);
1250 
1251       // error reporting is not MT-safe, block current thread
1252       os::infinite_sleep();
1253 
1254     } else {
1255       if (recursive_error_count++ > 30) {
1256         out.print_raw_cr("[Too many errors, abort]");
1257         os::die();
1258       }
1259 
1260       outputStream* const st = log.is_open() ? &log : &out;
1261       st->cr();
1262 
1263       // Timeout handling.
1264       if (step_timeout) {
1265         // The current step had a timeout. Lets continue reporting with the next step.
1266         st->print_raw("[timeout occurred during error reporting in step \"");
1267         st->print_raw(_current_step_info);
1268         st->print_cr("\"] after " INT64_FORMAT " ms.", os::javaTimeMillis() - step_start_time);
1269       } else if (reporting_timeout) {
1270         // We hit ErrorLogTimeout. Reporting will stop altogether. Lets wrap things

1271         // up, the process is about to be stopped by the WatcherThread.
1272         st->print_cr("------ Timout during error reporting after " INT64_FORMAT "ms. ------",
1273           os::javaTimeMillis() - reporting_start_time);
1274         st->flush();
1275         // Watcherthread is about to call os::die. Lets just wait.
1276         os::infinite_sleep();
1277       } else {
1278         // Crash or assert during error reporting. Lets continue reporting with the next step.
1279         jio_snprintf(buffer, sizeof(buffer),
1280            "[error occurred during error reporting (%s), id 0x%x]",
1281                    _current_step_info, _id);
1282         st->print_raw_cr(buffer);
1283         st->cr();
1284       }
1285     }
1286   }
1287 
1288   // print to screen
1289   if (!out_done) {
1290     report(&out, false);
1291 
1292     out_done = true;
1293 


1476   } while (yes);
1477 }
1478 
1479 // Timeout handling: check if a timeout happened (either a single step did
1480 // timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt
1481 // the reporting thread if that is the case.
1482 bool VMError::check_timeout() {
1483 
1484   if (ErrorLogTimeout == 0) {
1485     return false;
1486   }
1487 
1488   // Do not check for timeouts if we still have a message box to show to the
1489   // user or if there are OnError handlers to be run.
1490   if (ShowMessageBoxOnError
1491       || (OnError != NULL && OnError[0] != '\0')
1492       || Arguments::abort_hook() != NULL) {
1493     return false;
1494   }
1495 
1496   const jlong reporting_start_time_l = get_timestamp(&reporting_start_time);
1497   const jlong now = os::javaTimeMillis();

1498   if (reporting_start_time_l > 0) {
1499     const jlong end = reporting_start_time_l + ErrorLogTimeout * 1000;
1500     if (end <= now) {
1501       reporting_timeout = true;
1502       VMError::interrupt_reporting_thread();
1503       return true; // global timeout
1504     }
1505   }
1506 
1507   const jlong step_start_time_l = get_timestamp(&step_start_time);
1508   if (step_start_time_l > 0) {
1509     // A step times out after half of the total timeout. Steps are mostly fast unless they
1510     // hang for some reason, so this simple rule allows for one hanging step and still
1511     // leaves time enough for the rest of the steps to finish.
1512     const jlong end = step_start_time_l + ErrorLogTimeout * 1000 / 2;
1513     if (end <= now) {
1514       step_timeout = true;
1515       VMError::interrupt_reporting_thread();
1516       return false; // (Not a global timeout)
1517     }
1518   }
1519 
1520   return false;
1521 
1522 }
1523 
   1 /*
   2  * Copyright (c) 2003, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *


 296 // information that may be unsafe to get after a fatal error. If it happens,
 297 // you may find nested report_and_die() frames when you look at the stack
 298 // in a debugger.
 299 //
 300 // In general, a hang in error handler is much worse than a crash or internal
 301 // error, as it's harder to recover from a hang. Deadlock can happen if we
 302 // try to grab a lock that is already owned by current thread, or if the
 303 // owner is blocked forever (e.g. in os::infinite_sleep()). If possible, the
 304 // error handler and all the functions it called should avoid grabbing any
 305 // lock. An important thing to notice is that memory allocation needs a lock.
 306 //
 307 // We should avoid using large stack allocated buffers. Many errors happen
 308 // when stack space is already low. Making things even worse is that there
 309 // could be nested report_and_die() calls on stack (see above). Only one
 310 // thread can report error, so large buffers are statically allocated in data
 311 // segment.
 312 
 313 int          VMError::_current_step;
 314 const char*  VMError::_current_step_info;
 315 
 316 volatile jlong VMError::_reporting_start_time = -1;
 317 volatile bool VMError::_reporting_did_timeout = false;
 318 volatile jlong VMError::_step_start_time = -1;
 319 volatile bool VMError::_step_did_timeout = false;

 320 
 321 // Helper, return current timestamp for timeout handling.
 322 jlong VMError::get_current_timestamp() {
 323   return os::javaTimeNanos();





 324 }
 325 // Factor to translate the timestamp to seconds.
 326 #define TIMESTAMP_TO_SECONDS_FACTOR (1000 * 1000 * 1000)
 327 
 328 void VMError::record_reporting_start_time() {
 329   const jlong now = get_current_timestamp();
 330   Atomic::store(now, &_reporting_start_time);
 331 }
 332 
 333 jlong VMError::get_reporting_start_time() {
 334   return Atomic::load(&_reporting_start_time);
 335 }
 336 
 337 void VMError::record_step_start_time() {
 338   const jlong now = get_current_timestamp();
 339   Atomic::store(now, &_step_start_time);
 340 }
 341 
 342 jlong VMError::get_step_start_time() {
 343   return Atomic::load(&_step_start_time);
 344 }
 345 
 346 void VMError::report(outputStream* st, bool _verbose) {
 347 
 348 # define BEGIN if (_current_step == 0) { _current_step = __LINE__;
 349 # define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s; \
 350   record_step_start_time(); _step_did_timeout = false;
 351 # define END }
 352 
 353   // don't allocate large buffer on stack
 354   static char buf[O_BUFLEN];
 355 
 356   BEGIN
 357 
 358   STEP("printing fatal error message")
 359 
 360     st->print_cr("#");
 361     if (should_report_bug(_id)) {
 362       st->print_cr("# A fatal error has been detected by the Java Runtime Environment:");
 363     } else {
 364       st->print_cr("# There is insufficient memory for the Java "
 365                    "Runtime Environment to continue.");
 366     }
 367 
 368 #ifndef PRODUCT
 369   // Error handler self tests
 370 
 371   // test secondary error handling. Test it twice, to test that resetting
 372   // error handler after a secondary crash works.
 373   STEP("test secondary crash 1")
 374     if (_verbose && TestCrashInErrorHandler != 0) {
 375       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 376         TestCrashInErrorHandler);
 377       controlled_crash(TestCrashInErrorHandler);
 378     }
 379 
 380   STEP("test secondary crash 2")
 381     if (_verbose && TestCrashInErrorHandler != 0) {
 382       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 383         TestCrashInErrorHandler);
 384       controlled_crash(TestCrashInErrorHandler);
 385     }
 386 
 387   // TestUnresponsiveErrorHandler: We want to test both step timeouts and global timeout.
 388   // Step to global timeout ratio is 4:1, so in order to be absolutely sure we hit the
 389   // global timeout, lets execute the timeout step five times.
 390   // See corresponding test in test/runtime/ErrorHandling/TimeoutInErrorHandlingTest.java
 391   #define TIMEOUT_TEST_STEP STEP("test unresponsive error reporting step") \




 392     if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }
 393   TIMEOUT_TEST_STEP
 394   TIMEOUT_TEST_STEP
 395   TIMEOUT_TEST_STEP
 396   TIMEOUT_TEST_STEP
 397   TIMEOUT_TEST_STEP
 398 
 399   STEP("test safefetch in error handler")
 400     // test whether it is safe to use SafeFetch32 in Crash Handler. Test twice
 401     // to test that resetting the signal handler works correctly.
 402     if (_verbose && TestSafeFetchInErrorHandler) {
 403       st->print_cr("Will test SafeFetch...");
 404       if (CanUseSafeFetch32()) {
 405         int* const invalid_pointer = (int*) get_segfault_address();
 406         const int x = 0x76543210;
 407         int i1 = SafeFetch32(invalid_pointer, x);
 408         int i2 = SafeFetch32(invalid_pointer, x);
 409         if (i1 == x && i2 == x) {
 410           st->print_cr("SafeFetch OK."); // Correctly deflected and returned default pattern
 411         } else {
 412           st->print_cr("??");
 413         }
 414       } else {
 415         st->print_cr("not possible; skipped.");
 416       }
 417     }


1204 
1205     // Initialize time stamps to use the same base.
1206     out.time_stamp().update_to(1);
1207     log.time_stamp().update_to(1);
1208 
1209     _id = id;
1210     _message = message;
1211     _thread = thread;
1212     _pc = pc;
1213     _siginfo = siginfo;
1214     _context = context;
1215     _filename = filename;
1216     _lineno = lineno;
1217     _size = size;
1218     jio_vsnprintf(_detail_msg, sizeof(_detail_msg), detail_fmt, detail_args);
1219 
1220     // first time
1221     set_error_reported();
1222 
1223     reporting_started();
1224     record_reporting_start_time();
1225 
1226     if (ShowMessageBoxOnError || PauseAtExit) {
1227       show_message_box(buffer, sizeof(buffer));
1228 
1229       // User has asked JVM to abort. Reset ShowMessageBoxOnError so the
1230       // WatcherThread can kill JVM if the error handler hangs.
1231       ShowMessageBoxOnError = false;
1232     }
1233 
1234     os::check_dump_limit(buffer, sizeof(buffer));
1235 
1236     // reset signal handlers or exception filter; make sure recursive crashes
1237     // are handled properly.
1238     reset_signal_handlers();
1239 
1240     TRACE_VM_ERROR();
1241 
1242   } else {
1243     // If UseOsErrorReporting we call this for each level of the call stack
1244     // while searching for the exception handler.  Only the first level needs


1250     if (first_error_tid != mytid) {
1251       char msgbuf[64];
1252       jio_snprintf(msgbuf, sizeof(msgbuf),
1253                    "[thread " INTX_FORMAT " also had an error]",
1254                    mytid);
1255       out.print_raw_cr(msgbuf);
1256 
1257       // error reporting is not MT-safe, block current thread
1258       os::infinite_sleep();
1259 
1260     } else {
1261       if (recursive_error_count++ > 30) {
1262         out.print_raw_cr("[Too many errors, abort]");
1263         os::die();
1264       }
1265 
1266       outputStream* const st = log.is_open() ? &log : &out;
1267       st->cr();
1268 
1269       // Timeout handling.
1270       if (_step_did_timeout) {
1271         // The current step had a timeout. Lets continue reporting with the next step.
1272         st->print_raw("[timeout occurred during error reporting in step \"");
1273         st->print_raw(_current_step_info);
1274         st->print_cr("\"] after " INT64_FORMAT " s.",
1275           (get_current_timestamp() - _step_start_time) / TIMESTAMP_TO_SECONDS_FACTOR);
1276       } else if (_reporting_did_timeout) {
1277         // We hit ErrorLogTimeout. Reporting will stop altogether. Let's wrap things
1278         // up, the process is about to be stopped by the WatcherThread.
1279         st->print_cr("------ Timeout during error reporting after " INT64_FORMAT " s. ------",
1280           (get_current_timestamp() - _reporting_start_time) / TIMESTAMP_TO_SECONDS_FACTOR);
1281         st->flush();
1282         // Watcherthread is about to call os::die. Lets just wait.
1283         os::infinite_sleep();
1284       } else {
1285         // Crash or assert during error reporting. Lets continue reporting with the next step.
1286         jio_snprintf(buffer, sizeof(buffer),
1287            "[error occurred during error reporting (%s), id 0x%x]",
1288                    _current_step_info, _id);
1289         st->print_raw_cr(buffer);
1290         st->cr();
1291       }
1292     }
1293   }
1294 
1295   // print to screen
1296   if (!out_done) {
1297     report(&out, false);
1298 
1299     out_done = true;
1300 


1483   } while (yes);
1484 }
1485 
1486 // Timeout handling: check if a timeout happened (either a single step did
1487 // timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt
1488 // the reporting thread if that is the case.
1489 bool VMError::check_timeout() {
1490 
1491   if (ErrorLogTimeout == 0) {
1492     return false;
1493   }
1494 
1495   // Do not check for timeouts if we still have a message box to show to the
1496   // user or if there are OnError handlers to be run.
1497   if (ShowMessageBoxOnError
1498       || (OnError != NULL && OnError[0] != '\0')
1499       || Arguments::abort_hook() != NULL) {
1500     return false;
1501   }
1502 
1503   const jlong reporting_start_time_l = get_reporting_start_time();
1504   const jlong now = get_current_timestamp();
1505   // Timestamp is stored in nanos.
1506   if (reporting_start_time_l > 0) {
1507     const jlong end = reporting_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR;
1508     if (end <= now) {
1509       _reporting_did_timeout = true;
1510       interrupt_reporting_thread();
1511       return true; // global timeout
1512     }
1513   }
1514 
1515   const jlong step_start_time_l = get_step_start_time();
1516   if (step_start_time_l > 0) {
1517     // A step times out after a quarter of the total timeout. Steps are mostly fast unless they
1518     // hang for some reason, so this simple rule allows for three hanging step and still
1519     // hopefully leaves time enough for the rest of the steps to finish.
1520     const jlong end = step_start_time_l + (jlong)ErrorLogTimeout * TIMESTAMP_TO_SECONDS_FACTOR / 4;
1521     if (end <= now) {
1522       _step_did_timeout = true;
1523       interrupt_reporting_thread();
1524       return false; // (Not a global timeout)
1525     }
1526   }
1527 
1528   return false;
1529 
1530 }
1531 
< prev index next >