SHARE
TWEET

search.sh

yojimbos_pastebin Jul 20th, 2018 (edited) 206 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env bash
  2.  
  3. # search.sh
  4.  
  5. # INTRODUCTION
  6.  
  7. # Lines that are comments are prefaced with the symbol '#'.  This script uses
  8. # 'surfraw' to search through a list of terms that are of interest.  A file of
  9. # search terms may be given as a command line argument; otherwise the script
  10. # will look for the file named 'search_terms.txt' that must be located in the
  11. # local directory.
  12.  
  13. # The results are written to a date- and time-stamped directory, one HTML file
  14. # per search term and search engine.  Each file can be opened as a local file in
  15. # any HTML-compliant graphical web browser, and the links it contains may be
  16. # followed from there.
  17.  
  18. # The procedure 'surfraw' was first written by Julian Assange and is today
  19. # maintained by others.  It contains (has built-in) 100+ engines called 'elvi'.
  20. # New 'elvi' may be added; all that is needed is the search url of a web site.
  21. # For example, for Zerohedge.com a search for 'Clinton 2020' generates the url:
  22.  
  23. # https://www.zerohedge.com/search-content?search_api_fulltext=Clinton+2020&sort_by=search_api_relevance
  24.  
  25. # This may be used to create a new 'elvi'.  Thus any web site or sites of interest
  26. # may be interrogated with this script if the search url can be obtained.  For more
  27. # information, see:
  28.  
  29. # https://www.techrepublic.com/blog/linux-and-open-source/how-to-write-your-own-elvi-for-surfraw/
  30.  
  31. # DEPENDENCIES
  32.  
  33. # bash, surfraw, lynx, sed and a graphical web browser (like firefox). Each of these
  34. # should be easy to install on any Linux system or on macOS (macOS has not been tested).
  35.  
  36. # FILE FORMATS
  37.  
  38. # Search terms are placed in a file, one term per line.  No empty lines are permitted.
  39. # For example the file 'search_terms.txt' may contain:
  40.  
  41. # Trump Putin Summit
  42. # Clinton 2020 campaign
  43. # Thailand Cave Rescue
  44. # Prime Minister May Brexit
  45. # Nigel Farage UKIP
  46.  
  47. # And, for example, the file 'search_engines.txt' may contain:
  48.  
  49. # dmoz
  50. # duckduckgo
  51. # google
  52. # bing
  53. # cnn
  54. # bbcnews
  55. # yandex
  56.  
  57. # The engine names contained by the file 'search_engines.txt' must have a corresponding
  58. # 'elvi' in 'surfraw'.
  59.  
  60. # LICENSE AND WARRANTY
  61.  
  62. # This is free for use or modification under the MIT License terms.  No warranty is
  63. # expressed or implied. No copyright is claimed at this time.
  64.  
  65. # INSTALLATION, USAGE AND CAUTIONS
  66.  
  67. # This file should be saved as 'search.sh'.  To make it executable, do 'chmod +x search.sh'.
  68. # Otherwise it may be run via the command 'bash search.sh'.  To use the executable form, do:
  69.  
  70. #   ./search.sh
  71. #
  72. # Or, when using a custom search term file name, do:
  73. #
  74. #   ./search.sh my_search_terms.txt
  75.  
  76. # The program creates a date- and time-stamped directory containing one HTML page per
  77. # search term/search engine pair, based upon the contents of the search terms file and the
  78. # file 'search_engines.txt'.  Open these pages to view the search results in a browser.
  79.  
  80. # A word of caution on usage.  Certain engines will block access if this procedure is used
  81. # too frequently.  Google in particular is on this list.  So if you find yourself locked out,
  82. # you have been warned.
  83.  
  84. # ENVIRONMENT AND GLOBALS
  85.  
  # Settings exported for surfraw.  Together they ask surfraw to use a text
  # browser rather than a graphical one, and 'lynx -source' writes the raw page
  # to stdout so that main can redirect it into a file.
  # NOTE(review): the variable names and their meaning come from surfraw's
  # configuration; confirm against the installed surfraw version.
  export SURFRAW_graphical='no'
  export SURFRAW_text_browser='lynx -source'
  export SURFRAW_results=100 # this may not work on all engines.
  89.  
  90. # PROCEDURES
  91.  
  # Apply sed expressions to a file and replace the file with the result.
  # Arguments: $1 - file to rewrite; all remaining arguments are passed to sed.
  # Returns:   0 on success; 1 if sed or the rename failed, in which case the
  #            scratch file '<file>.tmp' is removed again.
  _search_rewrite_in_place() {
      local target=$1
      shift
      local scratch="${target}.tmp"

      if sed "$@" "$target" > "$scratch" && mv -- "$scratch" "$target"
      then
          return 0
      fi
      rm -f -- "$scratch"
      return 1
  }

  # Run every search term through every search engine with 'surfraw' and save
  # each result as an html file inside a date and time stamped directory.
  # Arguments: $1 - optional file of search terms, one per line (defaults to
  #                 'search_terms.txt' in the current directory).
  # Reads:     'search_engines.txt' in the current directory, one engine per line.
  # Outputs:   progress messages on stdout, problems on stderr.
  # Returns:   0 if every search and conversion succeeded, 1 otherwise.  Exits
  #            with status 1 if an input file is missing or the output
  #            directory cannot be created.
  main(){

      local search_engines_file='search_engines.txt'
      # If a file name is supplied then use it, otherwise use the default.
      local search_terms_file="${1:-search_terms.txt}"
      local destination_folder=
      local destination_html_file=
      local line=
      local st=
      local se=
      local leaf=
      local failures=0
      local -a search_terms=()
      local -a search_engines=()
      local -a term_words=()
      local -a engine_words=()

      # Make sure the search terms file exists.
      if [[ ! -f $search_terms_file ]]
      then
          echo "Oops!  Could not find the file of search terms, exiting ..." >&2
          exit 1
      fi

      # Make sure the search engines file exists.
      if [[ ! -f $search_engines_file ]]
      then
          echo "Oops!  Could not find the file of search engines, exiting ..." >&2
          exit 1
      fi

      # Create the search results output directory.  The assignment is kept
      # separate from 'local' so that a failure of 'date' is not masked.
      if ! destination_folder=$(date '+%Y-%m-%d %T')
      then
          echo "Oops!  Could not create search directory, exiting ..." >&2
          exit 1
      fi
      if [[ ! -d $destination_folder ]]
      then
          if mkdir -- "$destination_folder"
          then
              echo "Creating directory $destination_folder ..."
          else
              echo "Oops!  Could not create search directory, exiting ..." >&2
              exit 1
          fi
      fi

      # Read the search terms/engines files into arrays, one entry per line.
      # Empty lines are skipped, a trailing carriage return is dropped, and a
      # last line without a newline is still read.
      while IFS= read -r line || [[ -n $line ]]
      do
          line=${line%$'\r'}
          if [[ -n $line ]]
          then
              search_terms+=("$line")
          fi
      done < "$search_terms_file"

      while IFS= read -r line || [[ -n $line ]]
      do
          line=${line%$'\r'}
          if [[ -n $line ]]
          then
              search_engines+=("$line")
          fi
      done < "$search_engines_file"

      # Now do the search and send the results to one html file per search
      # term and search engine.
      for st in "${search_terms[@]}"
      do
          # Split the term into words without glob expansion, so that a term
          # such as '*' is not replaced by the names of local files.
          IFS=$' \t\n' read -r -a term_words <<< "$st"
          (( ${#term_words[@]} > 0 )) || continue

          for se in "${search_engines[@]}"
          do
              IFS=$' \t\n' read -r -a engine_words <<< "$se"
              (( ${#engine_words[@]} > 0 )) || continue

              # A '/' in the term or engine would point outside the output
              # directory, so it is replaced in the file name only.
              leaf="$st ($se).html"
              destination_html_file="$destination_folder/${leaf//\//_}"

              if ! surfraw "${engine_words[@]}" "${term_words[@]}" > "$destination_html_file"
              then
                  echo "Search for '$st' with $se has failed ..." >&2
                  failures=$((failures + 1))
                  continue
              fi

              # Duckduckgo has some strange prefixes on links; these must be
              # removed.
              if [[ $se == duckduckgo ]]
              then
                  echo "Postprocessing $se html output ..."
                  if _search_rewrite_in_place "$destination_html_file" \
                      -e 's/\/l\/?kh=-1&amp;uddg=//g'
                  then
                      echo "Postprocessing $se has succeeded ..."
                  else
                      echo "Postprocessing $se has failed ..." >&2
                      failures=$((failures + 1))
                  fi
              fi

              # Links must have certain URL codes changed to ASCII or
              # clicking on them may fail.
              echo "Making essential URL ASCII conversions ..."
              if _search_rewrite_in_place "$destination_html_file" \
                  -e 's/%3A/:/g' -e 's/%2F/\//g'
              then
                  echo "Conversion has succeeded ..."
              else
                  echo "Conversion has failed ..." >&2
                  failures=$((failures + 1))
              fi

          done
      done

      # Report any failure to the caller through the return status.
      if (( failures > 0 ))
      then
          return 1
      fi
      return 0

  }
  172.  
  173. # INVOKE MAIN PROCEDURE
  174.  
  # Run main with the script's arguments passed through unchanged ("$@" keeps
  # a file name containing spaces as one argument), report the outcome, and
  # hand a failure status on to the caller of the script.
  if main "$@"
  then
      echo "Our work is done, exiting ..."
  else
      status=$?
      echo "Exiting, but there was some problem ..." >&2
      exit "$status"
  fi
  176.  
  177. # end of file
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top