#!/usr/local/bin/perl # go4check, v1.3 # #------------------------------------------------------------------------------- # Introduction # go4check checks gopher links, probing each connection and testing the # output received. It handles most types of links, reporting if the link # is ok, the host serving it is down/refusing connections, or its pathname # has changed. It is not 100% successful at this, especially when it # comes to gopher0 servers, but does indeed help you keep on top of links # in your server(s). # # To run, go4check requires only perl and socket.ph. It understands # gopher0 and gopher+ servers. # # go4check produces a line of output on stdout for each item appearing # in a gopher's menu: the name of the item plus a result. Indentation # serves to maintain items in context so problems can be located easily. # As an extra benefit, go4check's output can be used as a roadmap of # the gopher after some rather trivial editing to remove results. # # go4check is written by George A. Theall, George.A.Theall@mail.tju.edu. # You may freely use and redistribute this. I can not offer any # support for this but am interested in your comments, suggestions, # and problem reports. # # The latest version is available via gopher as: # gopher://tjgopher.tju.edu/00/networks/internet/tools/gopher/go4check # # Note: Version 1.3 will probably be the last version of go4check I release. # #------------------------------------------------------------------------------- # Operation # Before you run go4check, make sure perl and the header file socket.ph are # available on your system. [You can generate this file by running the perl # utility h2ph on /usr/include/sys/socket.h, or something similar.] # # Invoke go4check with the name of the server to check and an optional port # number. Other options can be used to specify a non-standard starting # path or generate copious debugging info. go4check will test the items # listed in the initial menu and recurse into any menus it finds as long # as the names of server it finds match the one specified at go4check's # invocation. go4check does, though, skip recursion if pathnames refer # to ftp gateways or point back to the initial entry point. # # Results are directed to stdout, so you probably will want to redirect # to a file. You might then remove instances of "...ok.", which # indicate no problems and finally search on "...can't connect.", # "...path changed.", and "...timed out.". Another possible result # is "...n/a.", which is used when go4check doesn't know how to check # a particular type of link. # # You may want to tune the variables that go4check uses for testing # items of type 2 and 7. See below where initial values are defined. # For items of type 2, go4check sends a invalid command, which causes # many CSO servers to respond in a way that go4check interprets as a # success. As for items of type 7, I don't know of any robust way # to test searches. Currently, the best solution appears to be # to search for a word that's common to whatever searches are in the # gopher being checked. # # go4check is slow; it probably belongs in a cron job to run at night. # #------------------------------------------------------------------------------- # History # 15-Mar-95, GAT, v1.3 # - Added ability to avoid recursing into selected paths. Paths are # tested using substr() so you can have go4check check a directory # but not recurse further by appending a "/", if that's what you want. # # 27-Feb-95, GAT, v1.2 # - Moved alarm for connecting to within the tcpconnect subroutine # to handle better time-out conditions. # - Wrapped initial gopher connection and telnet check with alarms. # # 31-Jan-95, GAT, v1.1 # - Alarms are now used to abort connections that are otherwise hung. # - Added patches from R.D. Cameron for supporting type 7 items with # non-empty paths and checking error returns of type 3. # - Fixed glitch that arose on some servers (gopher.uwsp.edu for one) # that return lines with non-standard endings. # - Explicitly added an assignment for $| and set it to true so output # will be flushed after every print. # # 17-Oct-94, GAT # - Added a semicolon after a line in make_URL. Its lack appears to # cause problems with some versions of Perl. # # 01-Sep-94, GAT, v1.0 # - Released publically. # # 10-Aug-94, GAT, v1.0b2 # - Added $snooze_length as a way to control how long to pause after # establishing a connection. # - Fixed initialization of %URLs. # - Changed format of internal URLs by removing ":" from between type # and path info. # - Used a configurable word to check search items. # - Added check of CSO servers. # - Adjusted regular expression used to check success/failure of # a link. # - Documented go4check's operation. # # 12-Jul-94, GAT, v1.0b1 # - Used pseudo URLs internally for storing links so they are not # checked more than once. # - Added support for most types of links, including telnet, binary # files, and searches. # - Used gopher+ protocol whenever possible to avoid retrieving # entire files. # # 09-Jun-94, GAT, v1.0a # - First version of go4check. Checks only files and directories. # #------------------------------------------------------------------------------- # Specify where perl can find include files. push(@INC, "/usr/local/lib/perl"); # Define initial values for selected variables. $| = 1; # flush after every print? $default_path2 = "helo"; # for searching type 2 items $default_search_term = "cancer"; # for searching type 7 items @excluded_paths = ( # paths to exclude "1/tjgopher/changes", "1/tju/atrium/", "1/tju/nutrition/atrium", "1/tju/jeffnews/current", "1/tju/jeffnews/backissues/", "1/tju/marketing/jeffnews", "1/gophers/bylocation/tju", "1/gophers/bylocation/philly"); $Indent = " "; # indentation at each level $snooze_length = 3; # time to snooze before connect $timeout = 180; # max len of connect (seconds) %URLs = (); # array of URL's on server # Check for options. $DEBUG = 0; # default to no debug if ($ARGV[0] eq '-d') { shift; $DEBUG = 1; } # Parse commandline args and provide help as needed. $inithost = shift || ""; # name of host to check $initport = shift || 70; # port number $initpath = shift || ""; # initial directory if ($inithost eq "" || $inithost eq "-?") { print "$0 checks links in a gopher by probing connections\n\n"; print "Usage: $0 [-d] host [port] [\"path\"]\n"; print " unless specified, port defaults to 70 and path to \"\".\n"; print " -d is used for debugging.\n"; exit(9); } # Set up subroutines to catch some alarms. $SIG{'ALRM'} = handle_Timeout; # Establish connection and check links. require 'sys/socket.ph'; chop($thishost = `hostname`); # needed for tcpconnect &check_Links($inithost, $initport, $initpath); exit(0); ######################################################################## # check_Links - checks links for a given directory. # # # # Notes: # # - Links on the same host will be followed unless they point to # # the root. While this will prevent most recursion, there may # # be some gophers with odd setups that lead to infinite loops. # # - FTP links are not followed. # # Entry: # # host = hostname # # port = port number # # path = selector string # # Exit: # # New links are appended to @URLs. # ######################################################################## sub check_Links { local($host, $port, $path) = @_; local($margin) = $Indent . $margin; local($stat); local(@Items); # Establish connection and read contents. $DEBUG && print "DEBUG: connecting to $host at port $port.\n"; ($GOPHER) = &tcpconnect($host, $thishost); if ($@ && $@ =~ /Timed Out/) { die "$@"; } ($GOPHER) || die "Can't connect"; $DEBUG && print "DEBUG: sending path \"$path\".\n"; eval { alarm($timeout); send($GOPHER, "$path\r\n", 0); @Items = <$GOPHER>; close($GOPHER); alarm(0); }; if ($@ && $@ =~ /Timed Out/) { die "$@"; } # Check each item, recursing into directories as necessary. foreach (@Items) { local($atype, $aname, $apath, $ahost, $aport, $aextra); s/\s*$//; # remove \r\n combo last if (/^\.$/); # done if line is just a period # Check status of each unique URL. $url = &make_URL($_); s/^(.)// && ($atype = $1); ($aname, $apath, $ahost, $aport, $aextra) = split(/\t/, $_); chop($ahost) if ($ahost =~ /\.$/); if (defined($URLs{$url})) { # already checked print "$margin$aname...$URLs{$url}.\n"; } else { $stat = ($URLs{$url} = &test_URL($url, $aextra)); print "$margin$aname...$stat.\n"; } # Recurse as necessary. if ($stat eq "ok" && $atype == 1 && $ahost eq $inithost && $aport eq $initport && $apath ne "" && &is_Excluded($apath) == 0 && $apath !~ /ftp.*:/) { &check_Links($ahost, $aport, $apath); } } } ################################################ # make_URL - constructs a URL from a string. # # # # Notes: # # - The URLs generated here are not 100% # # kosher, only used internally. # # # # Entry: # # string as passed by gopher server. # # Exit: # # string representing URL. # ################################################ sub make_URL { local($_) = @_; local($url); local($type, $name, $path, $host, $port); s/^(.)// && ($type = $1); ($name, $path, $host, $port) = split(/\t/, $_); chop($host) if ($host =~ /\.$/); if ($type =~ /[01245679sgMhIi]/) { $url = "gopher://$host:$port/$type$path"; } elsif ($type =~ /[8T]/) { $url = "telnet://"; $path !~ /^$/ && $url .= "$path@"; $url .= $host; $port > 0 && $url .= ":$port"; $url .= "/"; } return($url); } ########################################################################### # test_URL - check that a URL is accessible. # # # # Notes: # # - I don't have a good way to check gopher0 servers. Currently, I # # look for the string "error.host", which servers like gn seem to # # generate. However, this fails with KA9Q, for which an error # # message is indistinguishable from regular text. # # - For gopher+, a error code indicating a server is too busy is # # treated as an error. This may not be the right thing to do. # # - If the server understands gopher+, we'll only ask for info (!) # # so as not to retrieve large files. This approach also seems to # # be the only way to check ASK blocks reliably. # # - CSO nameservers (type 2) are checked with an invalid command - # # this returns a warning message from the server that is not # # regarded as an error by go4check. Using the command "fields" # # does *not* work since this typically results in lines starting # # with -2, which look like errors. # # - Checks of telnet links only see if host is up; no attempt # # is made to login to whatever account may be specified. # # - Checks of FTP links could be improved. Currently, the info # # returned is not examined beyond looking for the usual signs # # of failure. # # Entry: # # URL = URL to test # # GPLUS = extra character indicating a gopher+ item. # # Exit: # # Text string indicating status of URL: # # "ok" = everything ok # # "can't connect" = can't connect to host # # "path changed" = path changed # # "n/a" = unknown status # ########################################################################### sub test_URL { local($_, $gplus) = @_; local($protocol, $logonid, $host, $port, $type, $path); local($1, $2, $3, $4, $5); $DEBUG && print "DEBUG: checking $_.\n"; m#^(\w+)://(.*):(\d+)/?(.?)(.*)#; $protocol = $1; $host = $2; $port = $3; $type = $4; $path = $5; if ($host =~ /@/) { ($logonid, $host) = split(/@/, $host); } $DEBUG && print "protocol=$protocol; logonid=$logonid; host=$host; port=$port; type=$type; path=$path.\n"; # Check gopher links. if ($protocol eq "gopher") { local($GOPHER); local($Stuff); $DEBUG && print "DEBUG: checking gopher at $host;$port.\n"; ($GOPHER) = &tcpconnect($host, $thishost); if ($@ && $@ =~ /Timed Out/) { return "timed out"; } ($GOPHER) || return "can't connect"; $path .= "\t!" if ($gplus); # Modify selector to get only info if ($type eq "2") { $path = $default_path2 if ($path =~ /^$/); } elsif ($type eq "7") { # Modification Oct. 19/94 by R.D. Cameron to append # handle the nonempty $path case: to test in this # case, we send a tab and the search term after the # $path. if ($path =~ /^$/) { $path = $default_search_term; } else { $path = "$path\t$default_search_term"; } $path =~ s#^waissrc:(.*)/.*$#1$1#; } $DEBUG && print "DEBUG: sending path \"$path\".\n"; eval { alarm($timeout); send($GOPHER, "$path\r\n", 0); $Stuff = <$GOPHER>; close($GOPHER); alarm(0); }; if ($@ && $@ =~ /Timed Out/) { return "timed out"; } $DEBUG && print "DEBUG: read \"$Stuff\".\n"; # Test line for signs of errors. # # Modification Oct. 19/94 by R.D. Cameron to # check for type 3 error returns when a directory # listing is expected. (According to the gopher # protocol, "3" as the first character of a directory # entry always indicates error. if ((($type eq "1") | ($type eq "7")) & ($Stuff =~ /^3/)) { return("path changed"); } # Test line for other signs of errors. elsif ($Stuff =~ /(^\-\-\d)|(\terror.host\t\d+)/) { return("path changed"); } else { return("ok"); } } # Check telnet links. if ($protocol eq "telnet") { local($TELNET); $DEBUG && print "DEBUG: checking telnet at $host;$port.\n"; ($TELNET) = &tcpconnect($host, $thishost); if ($@ && $@ =~ /Timed Out/) { return "timed out"; } ($TELNET) || return "host down"; return "ok"; close($TELNET); } # If we get here, we don't know how to test the link. return("n/a"); } ####################################################### # is_Excluded - checks if a path is to be excluded. # # # # Entry: # # path to be tested. # # Exit: # # 0/1 indicating no/yes. # ####################################################### sub is_Excluded { local($path) = @_; for (@excluded_paths) { if (index($path, $_) >= $[) { return(1); } } return(0); } ################################################################ # This comes from gopherhunt by Paul Lindner. # # # # I've added a line to abort if it can't resolve an address. # # and return 0 if failure rather than die. GAT # # # # I also added an alarm to handle time-out conditions. GAT # ################################################################ sub tcpconnect { #Get TCP info in place local($host, $hostname) = @_; local($name, $aliases, $type, $len); local($thisaddr, $thataddr, $this, $that); local($sockaddr); $sockaddr = 'S n a4 x8'; ($name,$aliases,$proto) = getprotobyname('tcp'); ($name,$aliases,$port) = getservbyname($port, 'tcp') unless $port =~ /^\d+$/; ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname); ($name,$aliases,$type,$len,$thataddr) = gethostbyname($host); $name || return(0); $this = pack($sockaddr, &AF_INET, 0, $thisaddr); $that = pack($sockaddr, &AF_INET, $port, $thataddr); sleep($snooze_length); eval { alarm($timeout); socket(N, &PF_INET, &SOCK_STREAM, $proto) || return(0); bind(N, $this) || return(0); connect(N, $that) || return(0); alarm(0); }; return(N); } ##################################################### # handle_Timeout - Die with a specific message. # # # # Notes: # # - Calls to alarm() should be in an eval # # block. # # # # Entry: # # n/a # # Exit: # # Message "Timed Out" is returned. # ##################################################### sub handle_Timeout { $DEBUG && print "DEBUG: Timed Out.\n"; die "Timed Out"; }