#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use Socket;
use Getopt::Std;
use Net::IP;
use LWP::UserAgent;
# Detect whether the optional String::Compare module is installed; it is
# only required for percentage-match mode (i.e. when no -s string is given).
my $compare_available = 0;
# Test eval's own return value (the trailing 1) rather than inspecting $@:
# $@ is a global that can be clobbered by nested evals or destructors, so
# checking it after the fact is fragile.
if (eval { require String::Compare; String::Compare->import(); 1 }) {
    $compare_available = 1;
}
# Record the start time so we can report total runtime at exit
my $starttime = time();
# Shared state for the worker threads:
#   $ip_queue     - count of IP addresses still to be fetched
#   %page_queue   - page bodies keyed by IP, awaiting checking
#   %header_queue - response header names keyed by IP (written/read under
#                   %page_queue's lock, never locked on its own)
#   $fetching / $checking - number of live retrieve / check threads
#   $next_ip      - next IP address to scan, held as a 32-bit integer
my ($ip_queue, %page_queue, %header_queue, $fetching, $checking, $next_ip) :shared;
$fetching = 0;
$checking = 0;
# Get command line options (dies on missing/invalid mandatory options)
my ($search_range, $search_url, $search_string, $omit_string, $http_timeout, $threads_retrieve, $threads_check, $verbose) = getparams();
# Seed the scan: first address of the range as a packed integer, and the
# total number of addresses as the remaining-work counter
$next_ip = unpack 'N', inet_aton($search_range->ip());
$ip_queue = $search_range->size()->numify();
# Split the search URL into protocol, hostname, and path so we can rebuild
# URLs that target each candidate IP while sending the original hostname
# in the Host header
$search_url =~ m/^(\w+)\:\/\/(.+?)\/(.*)$/ or die('Could not extract URL components');
my ($search_protocol, $search_hostname, $search_path) = ($1, $2, $3);
print "Protocol: $search_protocol; Hostname: $search_hostname; Path: $search_path\n" if ($verbose > 0);
# In percent-match mode (no -s) fetch the original page once up front as
# the reference copy that candidate pages are compared against.
# NOTE(review): a list assignment in scalar context evaluates to the number
# of elements on its right-hand side, so if retrieve_page() returns
# (undef, undef) on failure this "or die" can never fire and $search_page
# is silently undef - confirm retrieve_page's failure return value.
my $search_page;
unless ($search_string) {
($search_page) = retrieve_page($search_url) or die ('Could not retrieve search page');
}
# Spin up retriever threads; handles are not kept here - threads are
# reaped via threads->list() in the join loop below
print "Starting $threads_retrieve retrieve threads\n" if ($verbose > 0);
for (my $threadcount = 0; $threadcount < $threads_retrieve; $threadcount++) {
my $thread = threads->create('start_retrieve');
}
# Spin up check threads
print "Starting $threads_check check threads\n" if ($verbose > 0);
for (my $threadcount = 0; $threadcount < $threads_check; $threadcount++) {
my $thread = threads->create('start_check');
}
# Reap threads as they finish; while none are joinable, print queue status
# once a second in verbose mode (keys() here is in scalar context via the
# string concatenation, so it prints the entry count)
while (my @threads = threads->list()) {
my @joinable = threads->list(threads::joinable);
if (@joinable) {
foreach my $thread (@joinable) {
$thread->join();
}
} else {
print "IPs in fetch queue: ". $ip_queue ."; Pages in check queue: ". keys(%page_queue) ."\n" if ($verbose > 0);
sleep(1);
}
}
# All threads joined - report total runtime and exit
my $runtime = time() - $starttime;
print "Ran in $runtime seconds\n" if ($verbose > 0);
exit;
# Worker loop for a retriever thread: claims IP addresses from the shared
# counters, fetches the search path from each candidate host, and queues
# successful responses for the checker threads. Takes no arguments and
# returns nothing; communicates entirely through the shared globals.
sub start_retrieve {
    # ++/-- on a shared variable is not atomic across threads, so guard the
    # live-worker count with its lock (checkers read it to decide when to stop).
    { lock($fetching); $fetching++; }
    RETRIEVE: while (1) {
        # Claim the next IP address inside a locked block so two threads can
        # never grab the same address or double-decrement the counter.
        my $ip;
        {
            lock($next_ip);
            lock($ip_queue);
            # No addresses left to fetch: this worker is finished.
            last RETRIEVE unless $ip_queue;
            $ip_queue--;
            $ip = inet_ntoa(pack 'N', $next_ip);
            $next_ip++;
        }
        # Fetch the page from the candidate IP, passing the original hostname
        # so the Host header copes with name-based virtual hosting.
        my ($page, $headers) = retrieve_page($search_protocol ."://". $ip ."/". $search_path, $search_hostname);
        # Queue the page if we got one; %page_queue's lock also guards the
        # parallel %header_queue entry. Failed fetches are simply skipped.
        if ($page) {
            lock(%page_queue);
            $page_queue{$ip} = $page;
            $header_queue{$ip} = $headers;
            print "$ip retrieved and added to queue\n" if ($verbose > 1);
        } else {
            print "$ip retrieve failed\n" if ($verbose > 1);
        }
    }
    { lock($fetching); $fetching--; }
}
# Worker loop for a checker thread: drains fetched pages from the shared
# queue and reports matches, either by search string (-s, with optional -o
# header omission) or by percentage similarity against the reference page.
# Takes no arguments and returns nothing; exits once the queue is empty and
# no retriever threads remain.
sub start_check {
    # ++/-- on a shared variable is not atomic across threads, so guard the
    # live-worker count with its lock.
    { lock($checking); $checking++; }
    CHECK: while (1) {
        my ($ip, $page, $headers);
        my $wait = 0;
        # Grab one queued page inside a locked block; %page_queue's lock also
        # guards the parallel %header_queue entry.
        {
            lock(%page_queue);
            my @keys = keys(%page_queue);
            if (@keys) {
                $ip = shift(@keys);
                # delete() returns the removed value, so fetch-and-remove in one step.
                $page = delete($page_queue{$ip});
                $headers = delete($header_queue{$ip});
            } elsif ($ip_queue || $fetching) {
                # Queue is empty but more pages may still arrive: wait, don't quit.
                $wait = 1;
            } else {
                # Nothing queued and no retrievers left running: we're done.
                last CHECK;
            }
        }
        # Sleep outside the locked block so retrievers can refill the queue.
        if ($wait) {
            sleep(1);
            next CHECK;
        }
        # String-match mode: report pages containing the search string, unless
        # the omit string matches the response headers.
        if ($search_string) {
            if ($page =~ /$search_string/) {
                if ($omit_string && $headers =~ /$omit_string/) {
                    print "$ip matched string but omitted based on headers\n" if ($verbose > 1);
                } else {
                    print "$ip matched string\n";
                }
            } else {
                print "$ip did not match string\n" if ($verbose > 1);
            }
        }
        # Percent-match mode: report similarity to the reference page.
        else {
            my $percent = compare_pages($search_page, $page);
            print "$ip is a $percent\% match\n";
        }
    }
    { lock($checking); $checking--; }
}
# Print the usage summary; Getopt::Std calls this automatically when
# --help is passed on the command line.
sub HELP_MESSAGE {
    print <<"USAGE";
noclouds.pl command line options:
-u: url to match against
-i: ip range to search
-s: string to search for instead of percent matching, mandatory if String::Compare is not installed
-o: optionally omit results where the header matches the provided string, only works if used in conjunction with -s
-t: optional HTTP connection timeout
-r: optional retrieve thread count (default 128)
-c: optional check thread count (default 4)
-v: optional verbose mode (Current useful values 0-2)
USAGE
    exit;
}
# Parse command line options and return the run configuration. Dies when a
# mandatory option is missing or invalid.
# Returns: ($search_range, $search_url, $search_string, $omit_string,
#           $http_timeout, $threads_retrieve, $threads_check, $verbose)
sub getparams {
    my %opts;
    getopts('u:i:s:o:t:r:c:v:', \%opts);
    # Optional verbose mode (-v): higher values produce more output while scanning.
    my $verbose = $opts{'v'} || 0;
    # Target URL (-u) is mandatory.
    my $search_url = $opts{'u'} or die ("No target URL provided");
    # IP range to scan (-i); Net::IP->new() returns undef on a bad range.
    # Direct method call instead of the indirect object syntax "new Net::IP",
    # which Perl can misparse.
    my $search_range = Net::IP->new($opts{'i'}) or die ('No valid search range supplied');
    # Optional string (-s) to search for in target pages instead of percent matching.
    my $search_string = $opts{'s'};
    # Optional string (-o) matched against headers to omit results (only with -s).
    my $omit_string = $opts{'o'};
    # Optional timeout (-t) for HTTP requests.
    my $http_timeout = $opts{'t'};
    # Optional retrieve (-r) and check (-c) thread counts with sensible defaults.
    my $threads_retrieve = $opts{'r'} || 128;
    my $threads_check = $opts{'c'} || 4;
    # Announce the search mode; percentage matching requires String::Compare,
    # so without it a -s search string is mandatory.
    if ($search_string) {
        print "Searching for $search_string\n" if ($verbose > 0);
    } elsif ($compare_available) {
        print "Checking percentage match against $search_url\n" if ($verbose > 0);
    } else {
        die('String::Compare is not available, provide a search string with -s');
    }
    return ($search_range, $search_url, $search_string, $omit_string, $http_timeout, $threads_retrieve, $threads_check, $verbose);
}
# Retrieve a page by URL, optionally overriding the Host header to cope
# with name-based virtual hosting. On success returns (content, headers),
# where headers is a '; '-joined list of the response header field names.
# On failure returns the empty list, so both call styles work:
#   my ($page, $headers) = retrieve_page($url, $host);   # undefs on failure
#   my ($page) = retrieve_page($url) or die;             # dies on failure
sub retrieve_page {
    my ($url, $hostname) = @_;
    # Spoof Safari 6.0.4 user agent headers so the scan looks like normal traffic.
    my $ua = LWP::UserAgent->new;
    $ua->agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.29.13 (KHTML, like Gecko) Version/6.0.4 Safari/536.29.13');
    $ua->timeout($http_timeout) if ($http_timeout);
    my $req = HTTP::Request->new(GET => $url);
    # Set the Host header if requested to deal with name-based virtual hosting.
    $req->header(Host => $hostname) if ($hostname);
    my $res = $ua->request($req);
    # Bare "return" yields an empty list in list context. The original
    # returned (undef, undef), which made "my ($page) = retrieve_page(...)
    # or die" always succeed: a list assignment in scalar context evaluates
    # to the element count on the right (2), which is always true.
    return unless $res->is_success();
    # NOTE(review): only the header *names* are collected here, not their
    # values - the -o omit match therefore runs against field names only;
    # confirm that is the intended behavior.
    my $headers = join('; ', $res->header_field_names());
    return ($res->content(), $headers);
}
# Compare the reference copy of the search page against a retrieved page
# and return how similar they are as a percentage, or undef when there is
# no page content to compare.
sub compare_pages {
    my ($reference, $candidate) = @_;
    # No candidate content means the comparison is meaningless.
    return undef() unless $candidate;
    # String::Compare::compare() yields a 0..1 score; scale to a percentage.
    return compare($reference, $candidate) * 100;
}