textsearch.pl

#!/usr/bin/perl

#################
## Searching MacBeth for text
## MacBeth text is here: http://pastebin.com/PV5YUBuA
#################

# Emit the CGI content-type line followed by an empty line.
print "Content-type: text/html", "\n" x 2;

# Top of the HTML document plus the search form. This is printed on every
# request, whether or not a query was supplied. The heredoc is single-quoted
# because the markup is purely literal and nothing needs interpolating.
print <<'END_OF_PAGE_TOP';
<HTML>
 <HEAD>
  <TITLE>Search Macbeth! - Coding 101 Episode 26</TITLE>
 </HEAD>
 <BODY>
  <H1>Search Macbeth</H1>
  <FORM METHOD="GET" ACTION="textsearch.pl">
    <STRONG>What do you want to search for in MacBeth?</STRONG>
    <INPUT TYPE="TEXT" NAME="search"><br />
    <INPUT TYPE="SUBMIT">
  </FORM>
END_OF_PAGE_TOP

# Read the query string, and if it carries a search term, print every
# paragraph of the Macbeth text that contains it with the term in bold.
#
# strict and warnings are switched on here so that this section is
# self-contained; both pragmas stay in effect from this point to the end
# of the file.
use strict;
use warnings;

# QUERY_STRING is not set when the script is run outside a web server, so
# fall back to the empty string to keep the length test below warning-free.
my $qs = defined $ENV{'QUERY_STRING'} ? $ENV{'QUERY_STRING'} : '';

# For testing via perl on the command line
#$qs = "search=toil";

# If the query string is empty we have nothing to do. This is not an error:
# it simply means the form has not been submitted yet.
if (length $qs) {

    # Turn "search=toil&count=5" into a hash: (search => 'toil', count => '5').
    my %form   = _parse_query_string($qs);
    my $search = $form{'search'};

    # Test for a non-empty string rather than plain truthiness so that a
    # search for "0" is treated as a real search term.
    if (defined $search && length $search) {

        # This is the Macbeth text file
        my $file = "macbeth.txt";

        print "<hr><h2>Search Results:</h2><hr>";

        my @paragraphs = _read_paragraphs($file);

        for my $paragraph (@paragraphs) {
            # Only paragraphs whose text contains the search term are shown.
            # The check runs before the speaker markup is added so that a term
            # such as "MACBETH." still finds the paragraph.
            next unless _count_hits($paragraph, $search) > 0;

            # Format the text so that the name of the character speaking
            # (a run of capitals ending in a period) is underlined.
            $paragraph =~ s/([A-Z\ ]+)\./<u>$1<\/u> /gs;

            # Bold the search term
            $paragraph = _highlight($paragraph, $search);

            # Now print out the result bordered by an HTML hard rule line
            print "$paragraph \n<hr>\n";
        }
    } else {
        # A query arrived but it holds no usable "search" value: either the
        # parameter is missing or it was submitted empty.
        print "<h1><blink>STOP HACKING MY PROGRAM</blink></h1>";
    }
} else {
  # We're here because I didn't receive a query. This is not an error.
}

# Decode a URL-encoded query string into a flat list of name/value pairs
# that can be assigned straight to a hash.
#   $query - raw query string, e.g. "search=to+be&count=5"
# Returns the pairs as a list; a name without a value maps to ''.
sub _parse_query_string {
    my ($query) = @_;
    my %pairs;

    for my $nvpair (split /&/, $query) {
        # The limit of 2 keeps any additional "=" as part of the value.
        my ($name, $value) = split /=/, $nvpair, 2;

        # Skip empty fragments such as the one between "&&".
        next unless defined $name && length $name;
        $value = '' unless defined $value;

        # Browsers send spaces as "+"; a literal plus arrives as %2B and is
        # restored by the percent-decoding on the next line.
        $value =~ tr/+/ /;
        $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;

        $pairs{$name} = $value;
    }

    return %pairs;
}

# Read the whole of $file, replacing the newline at the end of each line
# with an HTML <br> tag, and return the text as a list of paragraphs.
# Dies with the system error message if the file cannot be opened.
sub _read_paragraphs {
    my ($file) = @_;

    open my $fh, '<', $file or die "Could not open \"$file\": $!";

    my $fulltext = '';
    while (my $line = <$fh>) {
        $line =~ s/\n$/<br>/;
        $fulltext .= $line;
    }
    close $fh;

    return _split_paragraphs($fulltext);
}

# Split text whose line breaks are already "<br>" into paragraphs.
# A paragraph boundary is a <br> followed by one or more blank lines. A
# blank line may still hold whitespace (presumably a carriage return from
# CRLF line endings - confirm against the data file). The group is
# non-capturing because split would otherwise return the captured
# separator text as extra list elements.
sub _split_paragraphs {
    my ($fulltext) = @_;
    return split /<br>(?:\s*<br>)+/, $fulltext;
}

# Count the case-insensitive occurrences of $search in the visible text of
# $html. Anything inside <...> is skipped, so a term like "br" does not
# match the line-break tags. \Q..\E makes the term match literally, which
# stops user input from being run as a regular expression.
sub _count_hits {
    my ($html, $search) = @_;
    # In list context each match yields its capture: undef for a skipped
    # tag, the matched text for a real hit.
    return scalar grep { defined $_ } ($html =~ m{<[^>]*>|(\Q$search\E)}gi);
}

# Return $html with every occurrence of $search in the visible text wrapped
# in <strong> tags. Existing tags are copied through untouched.
sub _highlight {
    my ($html, $search) = @_;
    $html =~ s{(<[^>]*>)|(\Q$search\E)}
              { defined $1 ? $1 : "<strong>$2</strong>" }gie;
    return $html;
}

# Close the BODY and HTML elements, then end the script.
print join("\n", ' </BODY>', '</HTML>');

exit 0;