hotspot Sdiff src/share/vm/utilities

src/share/vm/utilities/vmError.cpp

rev 12487 : 8166944: Hanging Error Reporting steps may lead to torn error logs.
Reviewed-by: cjplummer, dholmes
Summary: Interupt error reporting if reporting steps hang to enable subsequent reporting steps to run.

  21  * questions.
  22  *
  23  */
  24 
  25 #include <fcntl.h>
  26 #include "precompiled.hpp"
  27 #include "code/codeCache.hpp"
  28 #include "compiler/compileBroker.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/collectedHeap.hpp"
  31 #include "logging/logConfiguration.hpp"
  32 #include "prims/whitebox.hpp"
  33 #include "runtime/arguments.hpp"
  34 #include "runtime/atomic.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/init.hpp"
  37 #include "runtime/os.hpp"
  38 #include "runtime/thread.inline.hpp"
  39 #include "runtime/vmThread.hpp"
  40 #include "runtime/vm_operations.hpp"

  41 #include "services/memTracker.hpp"
  42 #include "trace/traceMacros.hpp"
  43 #include "utilities/debug.hpp"
  44 #include "utilities/decoder.hpp"
  45 #include "utilities/defaultStream.hpp"
  46 #include "utilities/errorReporter.hpp"
  47 #include "utilities/events.hpp"
  48 #include "utilities/vmError.hpp"
  49 
  50 // List of environment variables that should be reported in error log file.
  51 const char *env_list[] = {
  52   // All platforms
  53   "JAVA_HOME", "JRE_HOME", "JAVA_TOOL_OPTIONS", "_JAVA_OPTIONS", "CLASSPATH",
  54   "JAVA_COMPILER", "PATH", "USERNAME",
  55 
  56   // Env variables that are defined on Solaris/Linux/BSD
  57   "LD_LIBRARY_PATH", "LD_PRELOAD", "SHELL", "DISPLAY",
  58   "HOSTTYPE", "OSTYPE", "ARCH", "MACHTYPE",
  59 
  60   // defined on Linux

 295 // information that may be unsafe to get after a fatal error. If it happens,
 296 // you may find nested report_and_die() frames when you look at the stack
 297 // in a debugger.
 298 //
 299 // In general, a hang in error handler is much worse than a crash or internal
 300 // error, as it's harder to recover from a hang. Deadlock can happen if we
 301 // try to grab a lock that is already owned by current thread, or if the
 302 // owner is blocked forever (e.g. in os::infinite_sleep()). If possible, the
 303 // error handler and all the functions it called should avoid grabbing any
 304 // lock. An important thing to notice is that memory allocation needs a lock.
 305 //
 306 // We should avoid using large stack allocated buffers. Many errors happen
 307 // when stack space is already low. Making things even worse is that there
 308 // could be nested report_and_die() calls on stack (see above). Only one
 309 // thread can report error, so large buffers are statically allocated in data
 310 // segment.
 311 
 312 int          VMError::_current_step;
 313 const char*  VMError::_current_step_info;
 314 

























 315 void VMError::report(outputStream* st, bool _verbose) {
 316 
 317 # define BEGIN if (_current_step == 0) { _current_step = __LINE__;
 318 # define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s;

 319 # define END }
 320 
 321   // don't allocate large buffer on stack
 322   static char buf[O_BUFLEN];
 323 
 324   BEGIN
 325 
 326   STEP("printing fatal error message")
 327 
 328     st->print_cr("#");
 329     if (should_report_bug(_id)) {
 330       st->print_cr("# A fatal error has been detected by the Java Runtime Environment:");
 331     } else {
 332       st->print_cr("# There is insufficient memory for the Java "
 333                    "Runtime Environment to continue.");
 334     }
 335 
 336 #ifndef PRODUCT
 337   // Error handler self tests
 338 
 339   // test secondary error handling. Test it twice, to test that resetting
 340   // error handler after a secondary crash works.
 341   STEP("test secondary crash 1")
 342     if (_verbose && TestCrashInErrorHandler != 0) {
 343       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 344         TestCrashInErrorHandler);
 345       controlled_crash(TestCrashInErrorHandler);
 346     }
 347 
 348   STEP("test secondary crash 2")
 349     if (_verbose && TestCrashInErrorHandler != 0) {
 350       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 351         TestCrashInErrorHandler);
 352       controlled_crash(TestCrashInErrorHandler);
 353     }
 354 











 355   STEP("test safefetch in error handler")
 356     // test whether it is safe to use SafeFetch32 in Crash Handler. Test twice
 357     // to test that resetting the signal handler works correctly.
 358     if (_verbose && TestSafeFetchInErrorHandler) {
 359       st->print_cr("Will test SafeFetch...");
 360       if (CanUseSafeFetch32()) {
 361         int* const invalid_pointer = (int*) get_segfault_address();
 362         const int x = 0x76543210;
 363         int i1 = SafeFetch32(invalid_pointer, x);
 364         int i2 = SafeFetch32(invalid_pointer, x);
 365         if (i1 == x && i2 == x) {
 366           st->print_cr("SafeFetch OK."); // Correctly deflected and returned default pattern
 367         } else {
 368           st->print_cr("??");
 369         }
 370       } else {
 371         st->print_cr("not possible; skipped.");
 372       }
 373     }
 374 #endif // PRODUCT

1159       Atomic::cmpxchg_ptr(mytid, &first_error_tid, -1) == -1) {
1160 
1161     // Initialize time stamps to use the same base.
1162     out.time_stamp().update_to(1);
1163     log.time_stamp().update_to(1);
1164 
1165     _id = id;
1166     _message = message;
1167     _thread = thread;
1168     _pc = pc;
1169     _siginfo = siginfo;
1170     _context = context;
1171     _filename = filename;
1172     _lineno = lineno;
1173     _size = size;
1174     jio_vsnprintf(_detail_msg, sizeof(_detail_msg), detail_fmt, detail_args);
1175 
1176     // first time
1177     set_error_reported();
1178 



1179     if (ShowMessageBoxOnError || PauseAtExit) {
1180       show_message_box(buffer, sizeof(buffer));
1181 
1182       // User has asked JVM to abort. Reset ShowMessageBoxOnError so the
1183       // WatcherThread can kill JVM if the error handler hangs.
1184       ShowMessageBoxOnError = false;
1185     }
1186 
1187     os::check_dump_limit(buffer, sizeof(buffer));
1188 
1189     // reset signal handlers or exception filter; make sure recursive crashes
1190     // are handled properly.
1191     reset_signal_handlers();
1192 
1193     TRACE_VM_ERROR();
1194 
1195   } else {
1196     // If UseOsErrorReporting we call this for each level of the call stack
1197     // while searching for the exception handler.  Only the first level needs
1198     // to be reported.
1199     if (UseOSErrorReporting && log_done) return;
1200 
1201     // This is not the first error, see if it happened in a different thread
1202     // or in the same thread during error reporting.
1203     if (first_error_tid != mytid) {
1204       char msgbuf[64];
1205       jio_snprintf(msgbuf, sizeof(msgbuf),
1206                    "[thread " INTX_FORMAT " also had an error]",
1207                    mytid);
1208       out.print_raw_cr(msgbuf);
1209 
1210       // error reporting is not MT-safe, block current thread
1211       os::infinite_sleep();
1212 
1213     } else {
1214       if (recursive_error_count++ > 30) {
1215         out.print_raw_cr("[Too many errors, abort]");
1216         os::die();
1217       }
1218 



















1219       jio_snprintf(buffer, sizeof(buffer),
1220                    "[error occurred during error reporting (%s), id 0x%x]",
1221                    _current_step_info, _id);
1222       if (log.is_open()) {
1223         log.cr();
1224         log.print_raw_cr(buffer);
1225         log.cr();
1226       } else {
1227         out.cr();
1228         out.print_raw_cr(buffer);
1229         out.cr();
1230       }
1231     }
1232   }
1233 
1234   // print to screen
1235   if (!out_done) {
1236     report(&out, false);
1237 
1238     out_done = true;
1239 
1240     _current_step = 0;
1241     _current_step_info = "";
1242   }
1243 
1244   // print to error log file
1245   if (!log_done) {
1246     // see if log file is already open
1247     if (!log.is_open()) {
1248       // open log file
1249       int fd = prepare_log_file(ErrorFile, "hs_err_pid%p.log", buffer, sizeof(buffer));

1404                      os::strerror(errno), os::errno_name(errno), errno);
1405     }
1406   }
1407 }
1408 
1409 void VMError::report_java_out_of_memory(const char* message) {
1410   if (OnOutOfMemoryError && OnOutOfMemoryError[0]) {
1411     MutexLocker ml(Heap_lock);
1412     VM_ReportJavaOutOfMemory op(message);
1413     VMThread::execute(&op);
1414   }
1415 }
1416 
1417 void VMError::show_message_box(char *buf, int buflen) {
1418   bool yes;
1419   do {
1420     error_string(buf, buflen);
1421     yes = os::start_debugging(buf,buflen);
1422   } while (yes);
1423 }

  21  * questions.
  22  *
  23  */
  24 
  25 #include <fcntl.h>
  26 #include "precompiled.hpp"
  27 #include "code/codeCache.hpp"
  28 #include "compiler/compileBroker.hpp"
  29 #include "compiler/disassembler.hpp"
  30 #include "gc/shared/collectedHeap.hpp"
  31 #include "logging/logConfiguration.hpp"
  32 #include "prims/whitebox.hpp"
  33 #include "runtime/arguments.hpp"
  34 #include "runtime/atomic.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/init.hpp"
  37 #include "runtime/os.hpp"
  38 #include "runtime/thread.inline.hpp"
  39 #include "runtime/vmThread.hpp"
  40 #include "runtime/vm_operations.hpp"
  41 #include "runtime/vm_version.hpp"
  42 #include "services/memTracker.hpp"
  43 #include "trace/traceMacros.hpp"
  44 #include "utilities/debug.hpp"
  45 #include "utilities/decoder.hpp"
  46 #include "utilities/defaultStream.hpp"
  47 #include "utilities/errorReporter.hpp"
  48 #include "utilities/events.hpp"
  49 #include "utilities/vmError.hpp"
  50 
  51 // List of environment variables that should be reported in error log file.
  52 const char *env_list[] = {
  53   // All platforms
  54   "JAVA_HOME", "JRE_HOME", "JAVA_TOOL_OPTIONS", "_JAVA_OPTIONS", "CLASSPATH",
  55   "JAVA_COMPILER", "PATH", "USERNAME",
  56 
  57   // Env variables that are defined on Solaris/Linux/BSD
  58   "LD_LIBRARY_PATH", "LD_PRELOAD", "SHELL", "DISPLAY",
  59   "HOSTTYPE", "OSTYPE", "ARCH", "MACHTYPE",
  60 
  61   // defined on Linux

 296 // information that may be unsafe to get after a fatal error. If it happens,
 297 // you may find nested report_and_die() frames when you look at the stack
 298 // in a debugger.
 299 //
 300 // In general, a hang in error handler is much worse than a crash or internal
 301 // error, as it's harder to recover from a hang. Deadlock can happen if we
 302 // try to grab a lock that is already owned by current thread, or if the
 303 // owner is blocked forever (e.g. in os::infinite_sleep()). If possible, the
 304 // error handler and all the functions it called should avoid grabbing any
 305 // lock. An important thing to notice is that memory allocation needs a lock.
 306 //
 307 // We should avoid using large stack allocated buffers. Many errors happen
 308 // when stack space is already low. Making things even worse is that there
 309 // could be nested report_and_die() calls on stack (see above). Only one
 310 // thread can report error, so large buffers are statically allocated in data
 311 // segment.
 312 
 313 int          VMError::_current_step;
 314 const char*  VMError::_current_step_info;
 315 
 316 // Timeout handling.
 317 static volatile jlong reporting_start_time = -1;
 318 static volatile bool reporting_timeout = false;
 319 static volatile jlong step_start_time = -1;
 320 static volatile bool step_timeout = false;
 321 
 322 // Helper function, sets a timestamp to the current time.
 323 static void set_to_now(volatile jlong* p) {
 324   const jlong now = os::javaTimeMillis();
 325   if (VM_Version::supports_cx8()) {
 326     Atomic::store(now, p);
 327   } else {
 328     *p = now;
 329   }
 330 }
 331 
 332 // helper function, returns value of time stamp.
 333 static jlong get_timestamp(volatile jlong* p) {
 334   if (VM_Version::supports_cx8()) {
 335     return Atomic::load(p);
 336   } else {
 337     return *p;
 338   }
 339 }
 340 
 341 void VMError::report(outputStream* st, bool _verbose) {
 342 
 343 # define BEGIN if (_current_step == 0) { _current_step = __LINE__;
 344 # define STEP(s) } if (_current_step < __LINE__) { _current_step = __LINE__; _current_step_info = s; \
 345   set_to_now(&step_start_time); step_timeout = false;
 346 # define END }
 347 
 348   // don't allocate large buffer on stack
 349   static char buf[O_BUFLEN];
 350 
 351   BEGIN
 352 
 353   STEP("printing fatal error message")
 354 
 355     st->print_cr("#");
 356     if (should_report_bug(_id)) {
 357       st->print_cr("# A fatal error has been detected by the Java Runtime Environment:");
 358     } else {
 359       st->print_cr("# There is insufficient memory for the Java "
 360                    "Runtime Environment to continue.");
 361     }
 362 
 363 #ifndef PRODUCT
 364   // Error handler self tests
 365 
 366   // test secondary error handling. Test it twice, to test that resetting
 367   // error handler after a secondary crash works.
 368   STEP("test secondary crash 1")
 369     if (_verbose && TestCrashInErrorHandler != 0) {
 370       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 371         TestCrashInErrorHandler);
 372       controlled_crash(TestCrashInErrorHandler);
 373     }
 374 
 375   STEP("test secondary crash 2")
 376     if (_verbose && TestCrashInErrorHandler != 0) {
 377       st->print_cr("Will crash now (TestCrashInErrorHandler=" UINTX_FORMAT ")...",
 378         TestCrashInErrorHandler);
 379       controlled_crash(TestCrashInErrorHandler);
 380     }
 381 
 382   // TestUnresponsiveErrorHandler: three times to trigger first a step timeout, then
 383   // the global error reporting timeout.
 384   STEP("test unresponsive error reporting step 1")
 385     if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }
 386 
 387   STEP("test unresponsive error reporting step 2")
 388     if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }
 389 
 390   STEP("test unresponsive error reporting step 3")
 391     if (_verbose && TestUnresponsiveErrorHandler) { os::infinite_sleep(); }
 392 
 393   STEP("test safefetch in error handler")
 394     // test whether it is safe to use SafeFetch32 in Crash Handler. Test twice
 395     // to test that resetting the signal handler works correctly.
 396     if (_verbose && TestSafeFetchInErrorHandler) {
 397       st->print_cr("Will test SafeFetch...");
 398       if (CanUseSafeFetch32()) {
 399         int* const invalid_pointer = (int*) get_segfault_address();
 400         const int x = 0x76543210;
 401         int i1 = SafeFetch32(invalid_pointer, x);
 402         int i2 = SafeFetch32(invalid_pointer, x);
 403         if (i1 == x && i2 == x) {
 404           st->print_cr("SafeFetch OK."); // Correctly deflected and returned default pattern
 405         } else {
 406           st->print_cr("??");
 407         }
 408       } else {
 409         st->print_cr("not possible; skipped.");
 410       }
 411     }
 412 #endif // PRODUCT

1197       Atomic::cmpxchg_ptr(mytid, &first_error_tid, -1) == -1) {
1198 
1199     // Initialize time stamps to use the same base.
1200     out.time_stamp().update_to(1);
1201     log.time_stamp().update_to(1);
1202 
1203     _id = id;
1204     _message = message;
1205     _thread = thread;
1206     _pc = pc;
1207     _siginfo = siginfo;
1208     _context = context;
1209     _filename = filename;
1210     _lineno = lineno;
1211     _size = size;
1212     jio_vsnprintf(_detail_msg, sizeof(_detail_msg), detail_fmt, detail_args);
1213 
1214     // first time
1215     set_error_reported();
1216 
1217     reporting_started();
1218     set_to_now(&reporting_start_time);
1219 
1220     if (ShowMessageBoxOnError || PauseAtExit) {
1221       show_message_box(buffer, sizeof(buffer));
1222 
1223       // User has asked JVM to abort. Reset ShowMessageBoxOnError so the
1224       // WatcherThread can kill JVM if the error handler hangs.
1225       ShowMessageBoxOnError = false;
1226     }
1227 
1228     os::check_dump_limit(buffer, sizeof(buffer));
1229 
1230     // reset signal handlers or exception filter; make sure recursive crashes
1231     // are handled properly.
1232     reset_signal_handlers();
1233 
1234     TRACE_VM_ERROR();
1235 
1236   } else {
1237     // If UseOsErrorReporting we call this for each level of the call stack
1238     // while searching for the exception handler.  Only the first level needs
1239     // to be reported.
1240     if (UseOSErrorReporting && log_done) return;
1241 
1242     // This is not the first error, see if it happened in a different thread
1243     // or in the same thread during error reporting.
1244     if (first_error_tid != mytid) {
1245       char msgbuf[64];
1246       jio_snprintf(msgbuf, sizeof(msgbuf),
1247                    "[thread " INTX_FORMAT " also had an error]",
1248                    mytid);
1249       out.print_raw_cr(msgbuf);
1250 
1251       // error reporting is not MT-safe, block current thread
1252       os::infinite_sleep();
1253 
1254     } else {
1255       if (recursive_error_count++ > 30) {
1256         out.print_raw_cr("[Too many errors, abort]");
1257         os::die();
1258       }
1259 
1260       outputStream* const st = log.is_open() ? &log : &out;
1261       st->cr();
1262 
1263       // Timeout handling.
1264       if (step_timeout) {
1265         // The current step had a timeout. Lets continue reporting with the next step.
1266         st->print_raw("[timeout occurred during error reporting in step \"");
1267         st->print_raw(_current_step_info);
1268         st->print_cr("\"] after " INT64_FORMAT " ms.", os::javaTimeMillis() - step_start_time);
1269       } else if (reporting_timeout) {
1270         // We hit ErrorLogTimeout. Reporting will stop altogether. Lets wrap things
1271         // up, the process is about to be stopped by the WatcherThread.
1272         st->print_cr("------ Timout during error reporting after " INT64_FORMAT "ms. ------",
1273           os::javaTimeMillis() - reporting_start_time);
1274         st->flush();
1275         // Watcherthread is about to call os::die. Lets just wait.
1276         os::infinite_sleep();
1277       } else {
1278         // Crash or assert during error reporting. Lets continue reporting with the next step.
1279         jio_snprintf(buffer, sizeof(buffer),
1280            "[error occurred during error reporting (%s), id 0x%x]",
1281                    _current_step_info, _id);
1282         st->print_raw_cr(buffer);
1283         st->cr();






1284       }
1285     }
1286   }
1287 
1288   // print to screen
1289   if (!out_done) {
1290     report(&out, false);
1291 
1292     out_done = true;
1293 
1294     _current_step = 0;
1295     _current_step_info = "";
1296   }
1297 
1298   // print to error log file
1299   if (!log_done) {
1300     // see if log file is already open
1301     if (!log.is_open()) {
1302       // open log file
1303       int fd = prepare_log_file(ErrorFile, "hs_err_pid%p.log", buffer, sizeof(buffer));

1458                      os::strerror(errno), os::errno_name(errno), errno);
1459     }
1460   }
1461 }
1462 
1463 void VMError::report_java_out_of_memory(const char* message) {
1464   if (OnOutOfMemoryError && OnOutOfMemoryError[0]) {
1465     MutexLocker ml(Heap_lock);
1466     VM_ReportJavaOutOfMemory op(message);
1467     VMThread::execute(&op);
1468   }
1469 }
1470 
1471 void VMError::show_message_box(char *buf, int buflen) {
1472   bool yes;
1473   do {
1474     error_string(buf, buflen);
1475     yes = os::start_debugging(buf,buflen);
1476   } while (yes);
1477 }
1478 
1479 // Timeout handling: check if a timeout happened (either a single step did
1480 // timeout or the whole of error reporting hit ErrorLogTimeout). Interrupt
1481 // the reporting thread if that is the case.
1482 bool VMError::check_timeout() {
1483 
1484   if (ErrorLogTimeout == 0) {
1485     return false;
1486   }
1487 
1488   // Do not check for timeouts if we still have a message box to show to the
1489   // user or if there are OnError handlers to be run.
1490   if (ShowMessageBoxOnError
1491       || (OnError != NULL && OnError[0] != '\0')
1492       || Arguments::abort_hook() != NULL) {
1493     return false;
1494   }
1495 
1496   const jlong reporting_start_time_l = get_timestamp(&reporting_start_time);
1497   const jlong now = os::javaTimeMillis();
1498   if (reporting_start_time_l > 0) {
1499     const jlong end = reporting_start_time_l + ErrorLogTimeout * 1000;
1500     if (end <= now) {
1501       reporting_timeout = true;
1502       VMError::interrupt_reporting_thread();
1503       return true; // global timeout
1504     }
1505   }
1506 
1507   const jlong step_start_time_l = get_timestamp(&step_start_time);
1508   if (step_start_time_l > 0) {
1509     // A step times out after half of the total timeout. Steps are mostly fast unless they
1510     // hang for some reason, so this simple rule allows for one hanging step and still
1511     // leaves time enough for the rest of the steps to finish.
1512     const jlong end = step_start_time_l + ErrorLogTimeout * 1000 / 2;
1513     if (end <= now) {
1514       step_timeout = true;
1515       VMError::interrupt_reporting_thread();
1516       return false; // (Not a global timeout)
1517     }
1518   }
1519 
1520   return false;
1521 
1522 }
1523

< prev index next >