#!/bin/perl
# -*- perl -*-
#
#  Copyright (c) 1997, 2000 DJ Delorie, All Rights Reserved.  NO WARRANTEE.
#

# Note: headers.cgi is used by a Firefox extension.
#if ($ENV{'HTTP_REFERER'}
#    && $ENV{'HTTP_REFERER'} !~ m@^http://www.delorie.com/web/@) {
#    print "Location: http://www.delorie.com/web/\n\n";
#    exit 0;
#}

if ($ENV{'HTTP_USER_AGENT'} =~ /robot|spider|wget|crawl|slurp|googlebot|openfind/i) {
    exit 0;
}

sub rebase_url {
    local($tag, $uri, $rest) = @_;

    if ($uri =~ /^http:/) {
	$u = $uri;
    } elsif ($uri =~ /^[a-z]+:/) {
	if ($tag) {
	    return "$tag=\"$uri\"$rest";
	} else {
	    return $uri;
	}
    } elsif ($uri =~ m@^/@) {
	$u = $urlhead . $uri;
    } else {
	$u = $urlpath . $uri;
	0 while $u =~ s@/\./@/@g;
	0 while $u =~ s@/[^/]+/\.\./@/@g;
	0 while $u =~ s@/[^/]+/\.\.$@/@g;
	$u = $urlhead . $u;
    }

    if ($u !~ /\.(pdf|zip|gz|exe|gif|jpg|png)/) {
	$u =~ s/([^a-zA-Z0-9\._\#-])/sprintf("%%%02x", ord($1));/ge;
	$u = $me . $u;
    }

    if ($tag) {
	return "$tag=\"$u\"$rest";
    } else {
	return $u;
    }
}

sub canonical_url {
    local($base, $uri) = @_;
    local($urlhead, $urlpath) = $base =~ m@([^/]+/+[^/]+)(.*)@;
    $urlpath =~ s@/[^/]+$@/@;
    $urlpath = "/" unless $urlpath;

    if ($uri =~ /^[a-z]+:/) {
	return $uri;
    } elsif ($uri =~ m@^/@) {
	$u = $urlhead . $uri;
    } else {
	$u = $urlpath . $uri;
	0 while $u =~ s@/\./@/@g;
	0 while $u =~ s@/[^/]+/\.\./@/@g;
	0 while $u =~ s@/[^/]+/\.\.$@/@g;
	$u = $urlhead . $u;
    }
    return $u;
}

#-----------------------------------------------------------------------------

sub try_one {
    my ($url, $ref) = @_;
    $attempts .= "<p>Attempted URL:<tt> $url </tt>\n";
    $try_access_ok = &webget($url, $ref);
    $try_access_ok .= "</pre>";
    if ($webstatus eq "404") {
	$attempts .= " (not found)";
    } elsif ($webstatus eq "403") {
	$attempts .= " (access denied)";
    } else {
	$access_ok .= $try_access_ok;
    }
    return $webstatus =~ /^[123]/;
}

sub check_delorie_htm {
    my ($url) = @_;
    my ($ref) = "http://$ENV{'SERVER_NAME'}/web/howto-allow.html (on behalf of $ENV{'REMOTE_ADDR'})";
    my ($ourl) = $url;
    $attempts = '';

    ($host) = $url =~ m@[a-z]*://([^/:]+)@;
    print STDERR "url `$url` host `$host`\n";
    if ($host =~ m/delorie.com|thomii.com|localhost|^[0-9\.]+$/) {
	print "Please don't scan my own hosts.\n";
	exit 0;
    }

    #return if $ENV{'REMOTE_ADDR'} eq "207.22.48.171";

    $url =~ s@\?.*@@;
    $url =~ s@/[^/]*$@/delorie.htm@;
    return if &try_one($url, $ref);

    if ($url =~ m@/\~@) {
	$url =~ s@(/\~[^/]+)/.*$@$1/delorie.htm@;
	return if &try_one($url, $ref);
    }

    if ($url !~ m@//[^/]+/delorie.htm$@) {
	$url =~ s@(//[^/]+)/.*$@$1/delorie.htm@;
	return if &try_one($url, $ref);
    }

    $url = $ourl;
    $url =~ s@\?.*@@;
    $url =~ s@/[^/]*$@/delorie.html@;
    return if &try_one($url, $ref);

    if ($url =~ m@/\~@) {
	$url =~ s@(/\~[^/]+)/.*$@$1/delorie.html@;
	return if &try_one($url, $ref);
    }

    if ($url !~ m@//[^/]+/delorie.htm$@) {
	$url =~ s@(//[^/]+)/.*$@$1/delorie.html@;
	return if &try_one($url, $ref);
    }

    $url = $ourl;
    $url =~ s@\?.*@@;
    $url =~ s@/[^/]*$@/delorie.gif@;
    return if &try_one($url, $ref);

    if ($url =~ m@/\~@) {
	$url =~ s@(/\~[^/]+)/.*$@$1/delorie.gif@;
	return if &try_one($url, $ref);
    }

    if ($url !~ m@//[^/]+/delorie.htm$@) {
	$url =~ s@(//[^/]+)/.*$@$1/delorie.gif@;
	return if &try_one($url, $ref);
    }

    print "\n";
    print "<hr><p align=center>Sorry, but due to abuse, this service cannot access\n";
    print "sites that do not <a href=\"howto-allow.html\">explicitly\n";
    print "allow it</a>.</p>\n";

    print "<h1 align=center>Not Permitted</h1>\n";

    print "<p>You need to create a file called <b>delorie.htm</b> or\n";
    print "<b>delorie.gif</b> on your\n";
    print "web server to prove you're the webmaster.  When I see this file (it can be empty)\n";
    print "I'll allow my tools to access your site.</p>\n";

    $attempts =~ s/\r//g;
    $attempts =~ s@\n\n+@\n@g;
    print $attempts, "</p>\n";

    if ($access_ok) {
	print "<hr><br><br>\n";
	print "<p>Here is a copy of the error messages I got from the server\n";
	print "you were attempting to look at, in case that helps you.\n";
	print "Note that these are <em>not</em> errors with <em>my</em> tools, so pleaes\n";
	print "do <em>not</em> tell me my tools are broken because of these.</p><center>\n";

	print "<table border=1 cellspacing=0 cellpadding=5><tr><td bgcolor=\"#ffddbb\">\n";

	$access_ok =~ s/<img/<nimg/gi;
	$access_ok =~ s/<embed/<nembed/gi;
	$access_ok =~ s/<style/<nstyle/gi;
	$access_ok =~ s/<i?frame/<nframe/gi;
	$access_ok =~ s/<meta/<nmeta/gi;

	print $access_ok;
	print "</td></tr></table></center>\n";
    } else {
	open(H, "howto-allow.html");
	while (<H>) {
	    next if /\#perl/;
	    print;
	}
	close H;
    }
    sleep (60);
    exit 0;
}

sub webget {
    my ($url, $referer) = @_;

    if ($url =~ m@^[a-z]+://[^/]+$@) {
	$url .= "/";
    }
    print STDERR "webget $url\n";

    if ($url =~ /ebay.com/ && $ENV{'REMOTE_ADDR'} ne "207.22.48.171") {
	sleep (60);
	return '<h1>Error</h1>eBay is off limits.';
    }

    if ($times_webgot++ > 5) {
	sleep (60);
	return "<h1>Error</h1>too many levels of HTTP redirection\n";
    }

    if ($url !~ m@/delorie.(htm|gif)@ && ! $no_access_check) {
	&check_delorie_htm($url);
    }

    exit 0 if $host =~ /delorie\.com/i && $ENV{'SERVER_NAME'} =~ /delorie.com/;
    exit 0 if $host =~ /thomii\.com/;
    exit 0 if $host =~ /localhost/;
    exit 0 if $host =~ /^[0-9\.]+$/;

    my ($host,$port,$uri) = $url =~ m@http://([^/:]+)(:\d+)?(.*)@;
    if ($ENV{'SERVER_NAME'} =~ /not.delorie.com/) {
	$chost = "delorie.com";
	$uri = $url;
	$port = 9000;
    } else {
	$chost = $host;
	$port =~ s/://;
	$port = 80 unless $port;
	$uri = "/" unless $uri;
    }

    my ($name,$aliases,$type,$len,$addr) = gethostbyname($chost);
    if (! $addr) {
	return "<h1>Error</h1>Host $chost unknown";
    }
    $them = pack($sockaddr, &AF_INET, $port, $addr);

    unless (socket(S, &AF_INET, &SOCK_STREAM, $proto)) {
	return "<h1>Error</h1>Unable to create socket";
    }

    unless (connect(S, $them)) {
	return "<h1>Error</h1>Unable to connect to $chost:$port: $!";
    }

    select(S); $| = 1; select(STDOUT);

    print "webgot: $url\n" unless $ENV{'SERVER_NAME'};
    $webgot_url = $url;
    print S "GET $uri HTTP/1.0\r\n";
    print S "Host: $host\r\n";
    print S "User-Agent: $agent\r\n" if $agent;
    if ($ENV{"HTTP_PRAGMA"} =~ /no-cache/) {
	print S "Pragma: no-cache\r\n";
    }
    if ($referer) {
	print S "Referer: $referer\r\n";
    }
    print S "\r\n";

    $line = scalar(<S>);
    ($webstatus) = $line =~ m@ (\d+)@;
    print STDERR "webstatus $webstatus\n";
    if ($webstatus ne "200") {
	$resp = "\n<h1>Error</h1>Server returned error code $webstatus<pre>";
	$resp .= "Request: GET $uri HTTP/1.0\n";
	$resp .= $line;
	while (<S>) {
	    $resp .= $_;
	}
	close(S);
	if ($webstatus eq '302' || $webstatus eq '301') {
	    ($loc) = $resp =~ m@Location:\s*(.*\S)@;
	    $loc = &canonical_url($url, $loc);
	    return &webget($loc);
	}
	return $resp;
    }

    # Read the response header
    while (<S>) {
	$web_header .= $_ unless /Content-length/i || /^\s/;
	last unless /\S/;
    }

    $resp = "";
    # Read the data
    while (<S>) {
	s/meta http-equiv=/nometa x=/gi;
	$resp .= $_;
    }

    close(S);
    print STDERR "normal return\n";
    return $resp;
}

1;
