#!/bin/bash

# generate some helpful information
if [ "$1" == "help" ]; then
    echo "usage: call the script with at least 1 parameter:"
    echo "the board you want to dump"
    echo ""
    echo "the second parameter is optional and can be used"
    echo "to download the first 'x' pages (from 0 to 15), default 0"
    echo "you can also provide 'all' to dump them all"
    echo ""
    echo "examples:"
    echo "4chandump.sh p (downloads the first page of /p/)"
    echo "4chandump.sh p 4 (downloads page 0 through 4)"
    echo "4chandump.sh p all (downloads every page)"
    exit 0
fi
# TODO check for both arguments and default them to something if an argument is not provided

# parse the arguments
if [ "$1" == "" ]; then
    echo "error: no board given, run '4chandump.sh help' for usage" >&2
    exit 1
else
    BOARD="$1"
fi

if [ "$2" == "all" ]; then
    PAGES="15"
elif [ "$2" == "" ]; then
    PAGES="0"
else
    PAGES="$2"
fi

# ikno, dirty but whatevs
if [ ! -d "$BOARD" ]; then
    mkdir "$BOARD"
fi

cd "$BOARD" || exit 1

# remove old temp files (-f keeps the first run from complaining)
rm -f urls.tmp posts.tmp imageurls.tmp 4chan.html

# build the list of page urls
for (( p = 0; p <= PAGES; p++ )); do
    echo "http://boards.4chan.org/$BOARD/$p" >> urls.tmp
done
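
# urls.tmp now holds one page url per line, e.g. for "4chandump.sh p 2":
#   http://boards.4chan.org/p/0
#   http://boards.4chan.org/p/1
#   http://boards.4chan.org/p/2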

# get the html file
wget -i urls.tmp -O 4chan.html -nv
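# note: -O makes wget concatenate every downloaded page into the single
# file 4chan.html, so the parsing below runs over all the pages at once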

# parse the html file to get all the threads
sed 's/</\n</g' 4chan.html | grep "View Thread" | sed 's/.*\(res\/[0-9]*\).*/\1/g' | sed "s|\(.*\)|http://boards.4chan.org/$BOARD/\1|g" | uniq > posts.tmp
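
# how the pipeline above works, on a (made-up) line of board html like
#   <span class="..."><a href="res/1234567">View Thread</a></span>
# the first sed breaks the html so every tag starts its own line, grep
# keeps only the "View Thread" links, the second sed strips each line
# down to "res/1234567", the last sed prefixes the board url, and uniq
# drops adjacent duplicate urls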

# wget all the threads and output them to 1 single file
wget -i posts.tmp -O 4chan.html -nv

# parse all the threads to get the image urls
sed 's/</\n</g' 4chan.html | grep -E "\.(jpg|png|gif|swf)$" | grep "a href" | sed 's/.*>\([0-9]*\.[jpgs][pniw][fg]\).*/\1/' | sed "s|\(.*\)|http://images.4chan.org/$BOARD/src/\1|" | uniq > imageurls.tmp
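
# note: [0-9]* matches the numeric timestamp filenames 4chan uses, and
# the [jpgs][pniw][fg] character classes are a loose match for the
# extensions jpg, png, gif and swf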

# wget ALL DEM IMAGES
wget -i imageurls.tmp -nc
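# (-nc skips images that are already on disk, so the script can be re-run
# on the same board without downloading everything again)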

# zip all them shit and send them via email
#zip images.zip *
# remove the images after being zipped, takes up a lot of space so...
#rm images/*
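# one possible way to do the mailing, assuming mutt is available
# (the address and subject below are just placeholders):
#mutt -s "4chan dump of /$BOARD/" -a images.zip -- someone@example.com < /dev/null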

# remove temp files
rm -f urls.tmp posts.tmp imageurls.tmp 4chan.html

cd ..