oneeyedwillie

Improved 4chan image grabber script

Oct 21st, 2011
1,233
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/sh
  2. # This is modified from the original at: http://pastebin.com/8zqRpKkY
  3.  
  4. clear;
  5.  
  6. DIR="$HOME/Pictures/4ch"; # Base download dir for pictures
  7. DELAY=10; # Number of seconds to wait before checking for new posts
  8. URL="";
  9. THREADS=8;
  10. ARGS=$#;
  11. THIS=`basename $0`;
  12.  
  13. # Exit codes for wget
  14. WEXITS[0]="No problems occurred.";
  15. WEXITS[1]="Generic error code.";
  16. WEXITS[2]="Parse error -- for instance, when parsing command-line options, the .wgetrc or .netrc …";
  17. WEXITS[3]="File I/O error.";
  18. WEXITS[4]="Network failure.";
  19. WEXITS[5]="SSL verification failure.";
  20. WEXITS[6]="Username/password authentication failure.";
  21. WEXITS[7]="Protocol errors.";
  22. WEXITS[8]="Server issued an error response.";
  23. WEXITS[9]="Unknown error code.";
  24.  
  25. ##
  26. # Checks to see if the input is an integer
  27. # Returns 1 if it is, and a 0 if it isn't
  28. ##
  29. is_int ()
  30. {
  31.     expr $1 % 1 &> /dev/null;
  32.     isint=$?;
  33.  
  34.     if [[ "$isint" -lt "2" ]]; then
  35.         return 1;
  36.     else
  37.         return 0;
  38.     fi
  39. }
  40.  
  41. ##
  42. # Prints out the command help and exits
  43. ##
  44. usage ()
  45. {
  46.     tput bold;
  47.     echo "NAME";
  48.     tput sgr0;
  49.     echo $THIS -- Download images from a 4chan thread...";
  50.     tput bold;
  51.     echo "SYNPOSIS";
  52.     tput sgr0;
  53.     echo $THIS [DEST] [THREADS] [DELAY] <URL>\n";
  54.     tput bold;
  55.     echo "DESCRIPTION";
  56.     tput sgr0;
  57.    
  58.     # using a variable for the description paragraph to make it easier to format..
  59.     # I hate horozontal scrolling when I code.
  60.     DESC="$THIS will download all images (\"THREADS\" simultaneously) from a post on 4chan at the URL you provide, and save them to \"DEST\", creating new folders as necessary. After it downloads the current images it will wait \"DELAY\" seconds and check for new images. If there are new images, it will grab them.";
  61.     echo $DESC" | fmt -s 66 66;
  62.    
  63.     echo "\n    $THIS will stop when you Ctrl+C or it receives a 404 from 4chan.
  64.    
  65.     DEFAULTS:
  66.         DEST:    $DIR/<post number>/
  67.         DELAY:   $DELAY
  68.         THREADS: $THREADS\n";
  69.    
  70.     echo "  NOTES:
  71.         Only URL is required unless you specify [THREADS] at which point you must also specify a [DELAY], and if you specify a [DEST] you also have to specify [THREADS] AND [DELAY]." | fmt -s 66 66;
  72.  
  73.     tput bold;
  74.     echo "EXAMPLES";
  75.     tput sgr0;
  76.  
  77.     echo "  Download from URL with default options…
  78.     `basename $0` http://boards.4chan.org/b/res/359372915
  79.  
  80.     Download from URL with a 15s delay before checking for new images…
  81.     `basename $0` 15 http://boards.4chan.org/b/res/359372915
  82.    
  83.     Download from URL using 8 threads, with a 15s delay before checking for new images…
  84.     `basename $0` 8 15 http://boards.4chan.org/b/res/359372915
  85.  
  86.     Download from URL using 8 threads, with a 15s delay before checking for new images. Saves the images in /Users/John/4chan…
  87.     `basename $0` \"/Users/John/4chan\" 8 15 http://boards.4chan.org/b/res/359372915";
  88.     exit 1;
  89. }
  90.  
  91. ##
  92. # Check for the proper number of args, and returns the usage summary if an incorrect number is provided.
  93. ##
  94. if [ "$ARGS" -lt 1 ] || [ "$ARGS" -gt 4 ]; then
  95.     usage;
  96. fi
  97.  
  98. ##
  99. # Set the correct arg to the correct var based on # of args provided…
  100. ##
  101. if [ "$ARGS" -eq 1 ]; then
  102.     URL="$1";
  103. elif [ "$ARGS" -eq 2 ]; then
  104.     URL="$2";
  105.     DELAY="$1";
  106. elif [ "$ARGS" -eq 3 ]; then
  107.     URL="$3";
  108.     DELAY="$2";
  109.     THREADS="$1";
  110. elif [ "$ARGS" -eq 4 ]; then
  111.     URL="$4";
  112.     DELAY="$3";
  113.     THREADS="$2";
  114.     DIR="$1";
  115. else
  116.     usage;
  117. fi
  118.  
  119.  
  120. `is_int $DELAY`;
  121. delint=$?;
  122.  
  123. `is_int $THREADS`;
  124. thrint=$?;
  125.  
  126. if [[ "$delint" -ne "1" ]] || [[ "$thrint" -ne "1" ]]; then
  127.     echo "\nDELAY and THREADS must both be integers!\n";
  128.     usage;
  129. fi
  130.  
  131. ##
  132. # Don't want no negative delays…
  133. ##
  134. if [ "$DELAY" -lt 0 ]; then
  135.     DELAY=0;
  136. fi
  137.  
  138. ##
  139. # Want at least one thread right?
  140. ##
  141. if [ "$THREADS" -lt 1 ]; then
  142.     THREADS=1;
  143. fi
  144.  
  145. SUBDIR=$( echo "$URL" | egrep -o '([0-9]*)$' | sed 's/\.html//g' );
  146. LOC="$DIR/$SUBDIR";
  147. LOGFILE="$LOC/log.txt";
  148.  
  149.  
  150. if [ ! -d $LOC ]; then
  151.         mkdir -p $LOC;
  152. fi
  153.  
  154. cd $LOC;
  155.  
  156. while [ true ]; do
  157.  
  158.     # Fuck you clutter!
  159.     clear;
  160.  
  161.     # Lets show some user-friendly info
  162.     echo "4Chan Image Downloader
  163.  
  164. URL:     $URL
  165. DELAY:   $DELAY
  166. THREADS: $THREADS
  167. DEST:    $LOC
  168.  
  169.  
  170. I'll download all images ($THREADS at a time) from $URL and save them to $LOC.
  171. I'll pause for $DELAY seconds before checking for new images.
  172. I'll stop when I see a CTRL+C from you, or a 404 from 4Chan.\n\n";
  173.  
  174.         TMP=`mktemp /tmp/4chan.XXXXXX`; # Holds the webpage
  175.         TMP2=`mktemp /tmp/4chanm.XXXXXX`; # Holds the links to all the images in the post
  176.         TMP3=`mktemp /tmp/4chanc.XXXXXX`; # Holds the links to all the images you DON'T have
  177.         WGET_CMD="wget -nv -nc -a $LOGFILE"; # holds the command+args for the 2nd wget, to pull an image
  178.  
  179.         echo "Grabbing the post at: $URL\n";
  180.         wget -q -O "$TMP" "$URL";
  181.         WEXIT=$?; # so we can check the exit status of wget for that 404
  182.        
  183.         # testing for an error from wget
  184.         if [ $WEXIT -ne 0 ]; then
  185.            
  186.             # remove the temp files we created
  187.             rm $TMP $TMP2;
  188.            
  189.             # wget exits with an 8 if the server
  190.             # sent an error response code .. like a 404 maybe? ;-)
  191.             if [ $WEXIT -eq 8 ]
  192.                 then
  193.                     echo "There's that 404 we're waiting for ... lets make like a tree and get out of here!";
  194.                 else
  195.                     if [ $WEXIT -gt 8 ]; then
  196.                         WEXIT=9;
  197.                     fi
  198.                     echo "Couldn't get it up ... sorry. Here's the error we're exiting with:\nERROR: ${WEXITS[$WEXIT]} ";
  199.             fi
  200.            
  201.             exit $WEXIT;
  202.         fi
  203.  
  204.         ##
  205.         # Strip all the unique image URLs from the page and put them in TMP2
  206.         ##
  207.         egrep 'http://images.4chan.org/[a-z0-9]+/src/([0-9]*).(jpg|jpeg|png|gif)' "$TMP" -o | uniq > "$TMP2";
  208.  
  209.         ##
  210.         # Check to see which ones are already downloaded and put "new" ones in TMP3
  211.         ##
  212.         echo "Looking at the images you have .. fap fap fap..\n";
  213.         for file in `cat $TMP2`
  214.         do
  215.           fpath="$LOC/`echo $file | egrep -o '([0-9]*).(jpg|jpeg|png|gif)'`";
  216.          
  217.           if [ ! -e "$fpath" ]; then
  218.             echo $file >> "$TMP3";
  219.           fi
  220.          
  221.         done
  222.        
  223.         totalImages=`cat $TMP2 | wc -l | tr -d ' '`;
  224.         newImages=`cat $TMP3 | wc -l | tr -d ' '`;
  225.         imageDiff=$(( totalImages - newImages ));
  226.        
  227.         echo "You have $imageDiff of $totalImages images in this post.";
  228.         if [[ "$newImages" -gt "0" ]]; then
  229.             echo "Downloading $newImages images...";
  230.             cat $TMP3 | xargs -P $THREADS -I _URL_ $WGET_CMD _URL_;
  231.             SAVED=`cat $LOGFILE | wc -l | tr -d ' '`;
  232.             echo "($SAVED/$newImages) images were successfully saved.\n";
  233.             rm $LOGFILE;
  234.         else
  235.             echo "There are no new images to download.\n";
  236.         fi
  237.  
  238.         fapTime=3;
  239.  
  240.         if (( ("$DELAY" - "$fapTime") < 0 )); then
  241.             realFaptime=0;
  242.             fapTime="$DELAY";
  243.         else
  244.             realFaptime=$(( DELAY - fapTime ));
  245.         fi
  246.  
  247.         echo "I'm going to fap to those images.\nDon't worry, it only takes me $realFaptime seconds.\n";
  248.         sleep $realFaptime;
  249.         echo "Ahhhh… Hold on; lemme clean up my mess.";
  250.  
  251.         rm $TMP $TMP2 $TMP3;
  252.         sleep $fapTime;
  253. done;
RAW Paste Data