#!/usr/bin/env bash
# search.sh
# INTRODUCTION
# Lines that are comments are prefaced with the symbol '#'. This script uses
# 'surfraw' to search the web for a list of terms of interest. A file of
# search terms may be given as a command line argument; otherwise the script
# looks for a file named 'search_terms.txt' in the current directory.
# The results are sent to a date- and time-stamped directory, one file per
# search term and search engine, and may be opened as local files in any
# HTML-capable graphical web browser. The links they contain may be followed
# from there.
# 'surfraw' was first written by Julian Assange and is maintained today by
# others. It ships with 100+ built-in engines called 'elvi'. New 'elvi' may
# be added; all that is needed is the search URL of a web site. For example,
# for Zerohedge.com a search for 'Clinton 2020' generates the URL:
# https://www.zerohedge.com/search-content?search_api_fulltext=Clinton+2020&sort_by=search_api_relevance
# This URL may be used to create a new 'elvi', so any web site of interest
# may be interrogated with this script if its search URL can be obtained;
# a sketch follows below. For more information, see:
# https://www.techrepublic.com/blog/linux-and-open-source/how-to-write-your-own-elvi-for-surfraw/
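#
# As an illustration, a minimal 'elvi' built from the URL above might look
# like the sketch below. This follows the pattern described in the linked
# article; the elvi name 'zerohedge' and the file itself are assumptions for
# illustration, not part of a stock surfraw install. Save it (without the
# leading '# ') as 'zerohedge' in your elvi directory and make it executable.
#
#   #!/bin/sh
#   # elvis: zerohedge	-- Search Zerohedge (www.zerohedge.com)
#   . surfraw.elvi
#   w3_usage_hook () {
#       cat <<EOF
#   Usage: $w3_argv0 [options] [search words]...
#   Description:
#     Search Zerohedge (www.zerohedge.com)
#   EOF
#       w3_global_usage
#   }
#   w3_config
#   w3_parse_args "$@"
#   # w3_args now holds the search words, if any.
#   if test -z "$w3_args"; then
#       w3_browse_url "https://www.zerohedge.com/"
#   else
#       escaped_args=$(w3_url_of_arg $w3_args)
#       w3_browse_url "https://www.zerohedge.com/search-content?search_api_fulltext=${escaped_args}&sort_by=search_api_relevance"
#   fi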
# DEPENDENCIES
# bash, surfraw, lynx, sed and a graphical web browser (such as firefox). Each
# of these should be easy to install on any Linux system or on macOS (the
# latter has not been tested). A quick check is sketched below.
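#
# A minimal sketch of such a check, assuming only that the listed commands
# should be on PATH; it may be pasted near the top of this script if desired:
#
#   for cmd in surfraw lynx sed; do
#       command -v "$cmd" > /dev/null 2>&1 ||
#           { echo "Oops! Missing dependency: $cmd" >&2; exit 1; }
#   done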
# FILE FORMATS
# Search terms are placed in a file, one term per line. No empty lines are
# permitted. For example, the file 'search_terms.txt' may contain:
# Trump Putin Summit
# Clinton 2020 campaign
# Thailand Cave Rescue
# Prime Minister May Brexit
# Nigel Farage UKIP
# And, for example, the file 'search_engines.txt' may contain:
# dmoz
# duckduckgo
# google
# bing
# cnn
# bbcnews
# yandex
# Each engine name in the file 'search_engines.txt' must have a corresponding
# 'elvi' in 'surfraw'; see the note below for listing them.
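#
# To see which 'elvi' are available on your system, surfraw can list them
# along with short descriptions:
#
#   surfraw -elvi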
# LICENSE AND WARRANTY
# This is free for use or modification under the terms of the MIT License. No
# warranty is expressed or implied. No copyright is claimed at this time.
# INSTALLATION, USAGE AND CAUTIONS
# This file should be saved as 'search.sh'. To make it executable, run
# 'chmod +x search.sh'; otherwise it may be run via the command
# 'bash search.sh'. To use the executable form, do:
#
#   ./search.sh
#
# Or, when using a custom search term file name, do:
#
#   ./search.sh my_search_terms.txt
#
# The program will create a date- and time-stamped directory holding one html
# page per search term/search engine pair, based upon the contents of the
# files 'search_terms.txt' and 'search_engines.txt'. Click on these pages to
# view the search results in a browser; an example layout is shown below.
# A word of caution on usage: certain engines will block access if this
# procedure is used too frequently, and Google in particular is on that list.
# So if you find yourself locked out, you have been warned.
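#
# For illustration, a run started on 2018-07-16 at 14:05:32 (a hypothetical
# timestamp) against the example files above would produce pages such as:
#
#   2018-07-16 14:05:32/Trump Putin Summit (dmoz).html
#   2018-07-16 14:05:32/Trump Putin Summit (duckduckgo).html
#   ...
#   2018-07-16 14:05:32/Nigel Farage UKIP (yandex).html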
# ENVIRONMENT AND GLOBALS
export SURFRAW_graphical='no'
export SURFRAW_text_browser='lynx -source'
export SURFRAW_results=100 # this may not work on all engines.
# PROCEDURES
main(){
    local destination_html_file=
    local search_engines_file='search_engines.txt'
    local search_terms_file='search_terms.txt'
    local destination_folder="$(date +%Y-%m-%d\ %T)"
    # If a file name is supplied then use it. Otherwise default to the file
    # name 'search_terms.txt' that must be in the current directory. If no
    # file is found then exit.
    # First set the search terms file.
    if [[ -n $1 ]]
    then
        search_terms_file="$1"
    fi
    # Next make sure the search terms file exists.
    if [[ ! -f $search_terms_file ]]
    then
        echo "Oops! Could not find the file of search terms, exiting ..."
        exit 1
    fi
    # Next make sure the search engines file exists.
    if [[ ! -f $search_engines_file ]]
    then
        echo "Oops! Could not find the file of search engines, exiting ..."
        exit 1
    fi
    # Next create the search results output directory.
    if [[ ! -d $destination_folder ]]
    then
        if mkdir "$destination_folder" &> /dev/null
        then
            echo "Creating directory $destination_folder ..."
        else
            echo "Oops! Could not create search directory, exiting ..."
            exit 1
        fi
    fi
    # Finally read the search terms and search engines files each into an
    # array, one element per line.
    mapfile -t search_terms < "$search_terms_file"
    mapfile -t search_engines < "$search_engines_file"
    # Now do the search and send the results to a time-stamped html file
    # ordered by search term.
    for st in "${search_terms[@]}"
    do
        for se in "${search_engines[@]}"
        do
            destination_html_file="$destination_folder"'/'"$st"' ('"$se"').html'
            # $st is deliberately left unquoted so that surfraw receives each
            # word of the search term as a separate argument.
            surfraw "$se" $st > "$destination_html_file"
            # Duckduckgo has some strange prefixes on links; these must be
            # removed.
            if [[ $se == duckduckgo ]]
            then
                echo "Postprocessing $se html output ..."
                sed -e 's/\/l\/?kh=-1&uddg=//g' "$destination_html_file" > "$destination_html_file".tmp &&
                mv "$destination_html_file".tmp "$destination_html_file" &> /dev/null &&
                echo "Postprocessing $se has succeeded ..." ||
                echo "Postprocessing $se has failed ..."
            fi
            # Links must have certain URL escape codes changed back to ASCII
            # or clicking on them may fail.
            echo "Making essential URL ASCII conversions ..."
            sed -e 's/%3A/:/g' -e 's/%2F/\//g' "$destination_html_file" > "$destination_html_file".tmp &&
            mv "$destination_html_file".tmp "$destination_html_file" &> /dev/null &&
            echo "Conversion has succeeded ..." ||
            echo "Conversion has failed ..."
        done
    done
}
# INVOKE MAIN PROCEDURE
main "$@" && echo "Our work is done, exiting ..." || echo "Exiting, but there was some problem ..."
# end of file