/** Author: Robert Crocombe Classification: Unclassified Initial Release Date: Mon Dec 11 10:34:38 MST 2006 (c) 2006 Raytheon Missile Systems. Causes 1394 bus resets until a 1394 port disappears. Prerequisites: 1) Linux kernel support a) Enable (using 'make menuconfig' or similar) Device Drivers -> IEEE 1394 (FireWire) support -> IEEE1394 (FireWire Support) <- as a module [*] Excessive debugging output OHCI-1394 support Raw IEEE1394 I/O support Your .config file should thereby look similar to the following: # # IEEE 1394 (FireWire) support # CONFIG_IEEE1394=m # # Subsystem Options # CONFIG_IEEE1394_VERBOSEDEBUG=y CONFIG_IEEE1394_OUI_DB=y # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set # CONFIG_IEEE1394_EXPORT_FULL_API is not set # # Device Drivers # # # Texas Instruments PCILynx requires I2C # CONFIG_IEEE1394_OHCI1394=m # # Protocol Drivers # # CONFIG_IEEE1394_VIDEO1394 is not set # CONFIG_IEEE1394_SBP2 is not set # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=m 2) Linux library support a) Install libraw1394-1.2.1 from http://www.linux1394.org/ 3) Excessive debugging output goes to /var/log/messages -- elsewise change "the_file" below 4) It seems to be much easier to trigger the problematic behavior under the -rt kernels (http://people.redhat.com/~mingo/realtime-preempt/): we use 2.6.16-rt29, which is the most recent that we can compile. It does happen under the other kernels, though. After a port disappears, you can use: rmmod raw1394 ; rmmod ohci1394 to fix things up. Then after a: modprobe ohci1394 ; modprobe raw1394 you can run again to re-break things. */ #include #include #include #include #include #include #include #include static const char * const the_file = "/var/log/messages"; static const char * const NEEDLE = "fw-host"; static volatile sig_atomic_t stop; enum { BUF_SIZE = 4 << 10, MAX_PORTS = 8, DEFAULT_PORT = 0, GONE_TRIGGER = 10 }; //////////////////////////////////////////////////////////////////////////////// // Prototypes //////////////////////////////////////////////////////////////////////////////// void signal_handler(int signum, siginfo_t *info, void *p); char * signum_to_string(int signum); int do_the_thing(FILE *file_p); //////////////////////////////////////////////////////////////////////////////// // Definitions //////////////////////////////////////////////////////////////////////////////// /** Yar. Super sig handler to the rescue. */ void signal_handler(int signum, siginfo_t *info, void *p) { printf("Caught signal %d: %s -- ", signum, signum_to_string(signum)); if (!info) { printf("NULL siginfo_t pointer!\n"); if ((signum == SIGSEGV) || (signum == SIGBUS)) { printf("Attempting dirty shutdown *NOW*\n"); kill(0, SIGINT); exit(0); } printf("Attempting synchronous termination\n"); stop = 1; return; } else if (info->si_errno != 0) printf("si_errno is non-zero: %d\n", info->si_errno); else printf("no error detected\n"); switch(info->si_code) { case SI_QUEUE: // sent via sigqueue() printf("Sigqueue: pid is %d from user %d: value is %d\n", info->si_pid, info->si_uid, info->si_value.sival_int); break; case SI_USER: // sent via kill() or sigsend() printf("Signal sent from pid %d and user %d\n", info->si_pid, info->si_uid); // FALL THROUGH! default: // not generated by user -- but reason is encoded switch(signum) { case SIGINT: printf("SIGINT received: synchronous shutdown.\n"); stop = 1; break; case SIGSEGV: printf("SIGSEGV: segfault @ %12llx", (unsigned long long)info->si_addr); switch(info->si_code) { case SEGV_MAPERR: printf(": address not mapped to object.\n"); break; case SEGV_ACCERR: printf(": invalid permissions for mapped object.\n"); break; default: /* Regular generic segfault */ printf("\n"); } printf("Attempting dirty shutdown *NOW*\n"); // send SIGINT signal to all processes in the process group. kill(0, SIGINT); exit(0); break; case SIGBUS: // cannot return from signal handler with SIGBUS, because the // processor's PC is pointing to the instruction where the fault // occurs, so it'd simply fault again. So exit() and hope for // the best. printf("SIGBUS: memory fault @ %016llx\n", (unsigned long long)info->si_addr); printf("Attempting dirty shutdown *NOW*\n"); // send SIGINT signal to all processes in the process group. kill(0, SIGINT); exit(0); case SIGUSR1: printf("SIGUSR1: what?\n"); break; case SIGUSR2: printf("SIGUSR2: what?\n"); break; case SIGCHLD: printf("SIGCHLD: pid is %d, status is %d: ", info->si_pid, info->si_status); switch(info->si_code) // yes, switch on si_code again { case(CLD_EXITED): printf("Child exited.\n"); break; case(CLD_KILLED): printf("Child killed.\n"); break; case(CLD_DUMPED): // Wed Mar 16 17:51:14 MST 2005 // This seems to indicate that we're screwed and // should abort the run, so I'll set stop. printf("Child terminated abnormally (and dumped core)).\n"); stop = 1; break; case(CLD_TRAPPED): printf("Child trapped while tracing.\n"); break; case(CLD_STOPPED): printf("Child stopped.\n"); break; case(CLD_CONTINUED): printf("Child is continuing.\n"); break; default: printf("Yow: unknown child status of %d\n", info->si_code); } // SIGCHLD when not already terminating? Oooh, we're probably // boned. if (!stop) { printf("SIGCHLD when not terminating: boned?\n"); printf("Attempting dirty shutdown *NOW*\n"); // send SIGINT signal to all processes in the process group. kill(0, SIGINT); exit(0); } break; default: printf("Caught unhandled signal?\n"); } } } /** Linuxy flavor: somewhat fewer values than IRIXy flavor. */ char * signum_to_string(int signum) { #define DO_ME(a) case (a): return #a switch(signum) { DO_ME(SIGHUP); DO_ME(SIGINT); DO_ME(SIGQUIT); DO_ME(SIGILL); DO_ME(SIGTRAP); DO_ME(SIGABRT); DO_ME(SIGFPE); DO_ME(SIGKILL); DO_ME(SIGBUS); DO_ME(SIGSEGV); DO_ME(SIGSYS); DO_ME(SIGPIPE); DO_ME(SIGALRM); DO_ME(SIGTERM); DO_ME(SIGUSR1); DO_ME(SIGUSR2); DO_ME(SIGCHLD); DO_ME(SIGPWR); DO_ME(SIGWINCH); DO_ME(SIGURG); DO_ME(SIGPOLL); DO_ME(SIGSTOP); DO_ME(SIGTSTP); DO_ME(SIGCONT); DO_ME(SIGTTIN); DO_ME(SIGTTOU); DO_ME(SIGVTALRM); DO_ME(SIGPROF); DO_ME(SIGXCPU); DO_ME(SIGXFSZ); default: return "Unknown signal number!"; } #undef DO_ME } /** Returns # of 1394 ports or -1 on error. Takes argv and argc because you can supply the port # on which the reset should be generated (elsewise it happens on the bus connected to DEFAULT_PORT). For us, all the ports on the machine are connected to a single bus, so it makes no difference. It's just a chunk of code I already had. */ int setup_the_bus(raw1394handle_t *h, int argc, char *argv[]) { raw1394_portinfo port_info[MAX_PORTS] = {}; int port_num = argv[1] ? atoi(argv[1]) : DEFAULT_PORT; int status = 0; int num_ports = -1; raw1394handle_t handle = raw1394_new_handle(); if (!handle) { printf("Couldn't find any 1394 interfaces\n"); goto out; } num_ports = raw1394_get_port_info(handle, port_info, MAX_PORTS); if ((num_ports < 0) || (num_ports > MAX_PORTS)) { printf("Confused by # of ports == %d\n", num_ports); num_ports = -1; goto out; } // Ports run 0 to num_ports - 1 if ((port_num < 0) || (port_num >= num_ports)) { printf("Asked to reset port %d of %d: can't!\n", port_num, num_ports); num_ports = -1; goto out; } status = raw1394_set_port(handle, port_num); if (status) { if (errno == ESTALE) printf("Bus info got stale. Poo.\n"); else printf("Error in raw1394_set_port\n"); num_ports = -1; goto out; } // It's cool to do this because 'handle' isn't really a local variable: // it's a pointer to a kernel-allocated structure, so it's safe to point // 'h' here even after the function returns. *h = handle; out: return num_ports; } /** Return 0 if 'host_num' not found, else 1. This ended up being less complicated than I figured. */ int find_a_host(char *buffer, const char * const needle) { char *host_p = strstr(buffer, needle); if (!host_p) { // printf("Couldn't find %s inside this data\n\n%s\n", needle, buffer); return 0; } return 1; } /** Indeed. The thing must be done. Return 0 if haven't lost a port, else -1. Inside main() we caused a bus reset, and here we hope to analyze the data that is pouring into the_file to see if all our ports are staying with us. Here's the idea. 'NEEDLE' contains the base name for a firewire port as used in the_file. Each port adds a number to the end, so for: NEEDLE = fw-host then you'll have fw-host0, fw-host1, etc. That's why the '+ 2' below: 1 for digit, one for \0. So don't have 10+ ports or you'll be sad. So select() has told us there's data (probably, could be EOF), which we use fread() to retrieve. If it is EOF, we chill for awhile, then return. To poll like a bastard, take out the sleep. Assuming we have some data, we look through it for fw-host0, etc. up to the number of hosts that we have determined the system has: this is parameter 'num_ports'. If we don't see a particular port, we increase it's "gone_count" to note that it wasn't in the log. If we don't see it for GONE_TRIGGER consecutive iterations, we stop the program: it's very likely things have broken. It's probably okay to set this to 3 or 4, it's just that the debug info can take a couple of fread()s to arrive, so the information for a particular reset may be split across two calls to this function, so we can't rely on the port not being found once as definitive proof. Might be able to fix this by delaying for awhile after a reset for all the text to trickle in. */ int do_the_thing(FILE *file_p, int num_ports) { static char buffer[BUF_SIZE + 1]; static char needle_plus_host[strlen(NEEDLE) + 2]; static unsigned int gone_count[MAX_PORTS]; strcpy(needle_plus_host, NEEDLE); size_t got = fread(buffer, sizeof(char), BUF_SIZE, file_p); if (got < BUF_SIZE) { if (feof(file_p)) { if (got == 0) // Bumping against EOF: wait for some data. { sleep(1); return 0; } printf("Hit EOF: %u of %u\n", got, BUF_SIZE); } else if (ferror(file_p)) { printf("File error: %s\n", strerror(errno)); return -1; } else { printf("Short read: %u of %u\n", got, BUF_SIZE); } } else { printf("Full read: %u of %u\n", got, BUF_SIZE); } buffer[got + 1] = '\0'; // printf("\n%s\n", buffer); // Scan for missing ports int which_port = 0; for ( ; which_port < num_ports; ++which_port) { needle_plus_host[strlen(NEEDLE)] = '0' + which_port; printf("Looking for string '%s' for port %d\n", needle_plus_host, which_port); int found = find_a_host(buffer, needle_plus_host); if (!found) { printf("\n\nHEY, LOOKS LIKE port # %d is MISSING\n\n", which_port); ++gone_count[which_port]; } else { // Okay, found it: reset count gone_count[which_port] = 0; printf("Found port %d of %d\n", which_port, num_ports); } } // Did this round of checking push any port past the threshold? for (which_port = 0; which_port < num_ports; ++which_port) { if (gone_count[which_port] > GONE_TRIGGER) { printf("Okay, I'm pretty sure port %d is gone: " "missing %d consecutive times\n", which_port, GONE_TRIGGER); return -1; } } return 0; } /** 1) Setup signal handling. 2) Setup raw1394 bus handle and count # ports in machine. 3) Get access to logging data. 4) Loop: provoke bus reset wait for bad things to happen 5) Cleanup. */ int main(int argc, char *argv[]) { struct sigaction action; int error = 0; struct timespec timeout = { 1, 0 }; // select times out after 1 second int num_ports = -1; int fd, ret; unsigned int count = 0; fd_set read_set; raw1394handle_t h = 0; FILE *file_p = 0; // Setup signal handling action.sa_sigaction = signal_handler; action.sa_flags = SA_SIGINFO; error += sigaction(SIGINT, &action, 0); error += sigaction(SIGSEGV, &action, 0); if (error) { printf("Error in sigaction somewheres: %s\n", strerror(errno)); return 0; } else printf("Signal handler installed.\n"); // Find out how many ports we have. Get handle for causing resets. num_ports = setup_the_bus(&h, argc, argv); if (num_ports == -1) { printf("Error setting up 1394 bus access.\n"); goto out; } printf("There are %d 1394 interfaces.\n", num_ports); // Access logging file. file_p = fopen(the_file, "r"); if (!file_p) { printf("Failed to open %s: %s\n", the_file, strerror(errno)); goto out; } fd = fileno(file_p); if (fd == -1) { printf("Couldn't get file descriptor for %s\n", the_file); goto out; } // Go to end of file to wait for new data ret = fseeko(file_p, 0 , SEEK_END); if (ret) { printf("Failed to seek to end of %s: %s\n", the_file, strerror(errno)); goto out; } // Set up select() stuff. FD_ZERO(&read_set); FD_SET(fd, &read_set); // Until Ctl-C (or something bad). while (!stop) { // reset the bus! printf("Causing bus reset: bus has %d nodes...", raw1394_get_nodecount(h)); ret = raw1394_reset_bus(h); if (ret) printf("\n\nFailed to reset bus\n"); else printf("OK!\n"); // Wait for data: probably triggers off EOF mostly. Blah. ret = pselect(fd + 1, &read_set, 0, 0, &timeout, 0); printf("[%8u] Select sez %08x\n", count, ret); if (!FD_ISSET(fd, &read_set)) { printf("Select returned: nope, not us\n"); // bad! continue; } // See who showed up after the reset. ret = do_the_thing(file_p, num_ports); if (ret) break; ++count; } out: printf("Program terminating\n"); (void)fclose(file_p); if (h) raw1394_destroy_handle(h); return 0; }