#!/usr/bin/env bash
# search.sh
# INTRODUCTION
# Lines that are comments are prefaced with the symbol '#'. This script uses
# 'surfraw' to search the web for a list of terms of interest. A file of
# search terms may be given as a command line argument; otherwise the script
# looks for a file named 'search_terms.txt' in the current directory.
# The results are sent to a date- and time-stamped directory, one file per
# search term and search engine, and may be opened as local files in any
# HTML-capable graphical web browser. The links they contain may be followed
# from there.
# 'surfraw' was first written by Julian Assange and is maintained today by
# others. It ships with 100+ built-in engines called 'elvi'. New 'elvi' may
# be added; all that is needed is the search URL of a web site. For example,
# for Zerohedge.com a search for 'Clinton 2020' generates the URL:
# https://www.zerohedge.com/search-content?search_api_fulltext=Clinton+2020&sort_by=search_api_relevance
# This URL may be used to create a new 'elvi', so any web site of interest
# may be interrogated with this script if its search URL can be obtained;
# a sketch follows below. For more information, see:
# https://www.techrepublic.com/blog/linux-and-open-source/how-to-write-your-own-elvi-for-surfraw/
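#
# As an illustration, a minimal 'elvi' built from the URL above might look
# like the sketch below. This follows the pattern described in the linked
# article; the elvi name 'zerohedge' and the file itself are assumptions for
# illustration, not part of a stock surfraw install. Save it (without the
# leading '# ') as 'zerohedge' in your elvi directory and make it executable.
#
#   #!/bin/sh
#   # elvis: zerohedge	-- Search Zerohedge (www.zerohedge.com)
#   . surfraw.elvi
#   w3_usage_hook () {
#       cat <<EOF
#   Usage: $w3_argv0 [options] [search words]...
#   Description:
#     Search Zerohedge (www.zerohedge.com)
#   EOF
#       w3_global_usage
#   }
#   w3_config
#   w3_parse_args "$@"
#   # w3_args now holds the search words, if any.
#   if test -z "$w3_args"; then
#       w3_browse_url "https://www.zerohedge.com/"
#   else
#       escaped_args=$(w3_url_of_arg $w3_args)
#       w3_browse_url "https://www.zerohedge.com/search-content?search_api_fulltext=${escaped_args}&sort_by=search_api_relevance"
#   fi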
# DEPENDENCIES
# bash, surfraw, lynx, sed and a graphical web browser (such as firefox). Each
# of these should be easy to install on any Linux system or on macOS (the
# latter has not been tested). A quick check is sketched below.
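#
# A minimal sketch of such a check, assuming only that the listed commands
# should be on PATH; it may be pasted near the top of this script if desired:
#
#   for cmd in surfraw lynx sed; do
#       command -v "$cmd" > /dev/null 2>&1 ||
#           { echo "Oops! Missing dependency: $cmd" >&2; exit 1; }
#   done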
# FILE FORMATS
# Search terms are placed in a file, one term per line. No empty lines are
# permitted. For example, the file 'search_terms.txt' may contain:
# Trump Putin Summit
# Clinton 2020 campaign
# Thailand Cave Rescue
# Prime Minister May Brexit
# Nigel Farage UKIP
# And, for example, the file 'search_engines.txt' may contain:
# dmoz
# duckduckgo
# google
# bing
# cnn
# bbcnews
# yandex
# Each engine name in the file 'search_engines.txt' must have a corresponding
# 'elvi' in 'surfraw'; see the note below for listing them.
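#
# To see which 'elvi' are available on your system, surfraw can list them
# along with short descriptions:
#
#   surfraw -elvi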
# LICENSE AND WARRANTY
# This is free for use or modification under the terms of the MIT License. No
# warranty is expressed or implied. No copyright is claimed at this time.
# INSTALLATION, USAGE AND CAUTIONS
# This file should be saved as 'search.sh'. To make it executable, run
# 'chmod +x search.sh'; otherwise it may be run via the command
# 'bash search.sh'. To use the executable form, do:
#
#   ./search.sh
#
# Or, when using a custom search term file name, do:
#
#   ./search.sh my_search_terms.txt
#
# The program will create a date- and time-stamped directory holding one html
# page per search term/search engine pair, based upon the contents of the
# files 'search_terms.txt' and 'search_engines.txt'. Click on these pages to
# view the search results in a browser; an example layout is shown below.
# A word of caution on usage: certain engines will block access if this
# procedure is used too frequently, and Google in particular is on that list.
# So if you find yourself locked out, you have been warned.
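#
# For illustration, a run started on 2018-07-16 at 14:05:32 (a hypothetical
# timestamp) against the example files above would produce pages such as:
#
#   2018-07-16 14:05:32/Trump Putin Summit (dmoz).html
#   2018-07-16 14:05:32/Trump Putin Summit (duckduckgo).html
#   ...
#   2018-07-16 14:05:32/Nigel Farage UKIP (yandex).html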
# ENVIRONMENT AND GLOBALS
export SURFRAW_graphical='no'
export SURFRAW_text_browser='lynx -source'
export SURFRAW_results=100 # this may not work on all engines.
# PROCEDURES
main(){
    local destination_html_file=
    local search_engines_file='search_engines.txt'
    local search_terms_file='search_terms.txt'
    local destination_folder="$(date +%Y-%m-%d\ %T)"
    # If a file name is supplied then use it. Otherwise default to the file
    # name 'search_terms.txt' that must be in the current directory. If no
    # file is found then exit.
    # First set the search terms file.
    if [[ -n $1 ]]
    then
        search_terms_file="$1"
    fi
    # Next make sure the search terms file exists.
    if [[ ! -f $search_terms_file ]]
    then
        echo "Oops! Could not find the file of search terms, exiting ..."
        exit 1
    fi
    # Next make sure the search engines file exists.
    if [[ ! -f $search_engines_file ]]
    then
        echo "Oops! Could not find the file of search engines, exiting ..."
        exit 1
    fi
    # Next create the search results output directory.
    if [[ ! -d $destination_folder ]]
    then
        if mkdir "$destination_folder" &> /dev/null
        then
            echo "Creating directory $destination_folder ..."
        else
            echo "Oops! Could not create search directory, exiting ..."
            exit 1
        fi
    fi
    # Finally read the search terms and search engines files each into an
    # array, one element per line.
    mapfile -t search_terms < "$search_terms_file"
    mapfile -t search_engines < "$search_engines_file"
    # Now do the search and send the results to a time-stamped html file
    # ordered by search term.
    for st in "${search_terms[@]}"
    do
        for se in "${search_engines[@]}"
        do
            destination_html_file="$destination_folder"'/'"$st"' ('"$se"').html'
            # $st is deliberately left unquoted so that surfraw receives each
            # word of the search term as a separate argument.
            surfraw "$se" $st > "$destination_html_file"
            # Duckduckgo has some strange prefixes on links; these must be
            # removed.
            if [[ $se == duckduckgo ]]
            then
                echo "Postprocessing $se html output ..."
                sed -e 's/\/l\/?kh=-1&uddg=//g' "$destination_html_file" > "$destination_html_file".tmp &&
                mv "$destination_html_file".tmp "$destination_html_file" &> /dev/null &&
                echo "Postprocessing $se has succeeded ..." ||
                echo "Postprocessing $se has failed ..."
            fi
            # Links must have certain URL escape codes changed back to ASCII
            # or clicking on them may fail.
            echo "Making essential URL ASCII conversions ..."
            sed -e 's/%3A/:/g' -e 's/%2F/\//g' "$destination_html_file" > "$destination_html_file".tmp &&
            mv "$destination_html_file".tmp "$destination_html_file" &> /dev/null &&
            echo "Conversion has succeeded ..." ||
            echo "Conversion has failed ..."
        done
    done
}
# INVOKE MAIN PROCEDURE
main "$@" && echo "Our work is done, exiting ..." || echo "Exiting, but there was some problem ..."
# end of file