Advertisement
Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
#!/bin/bash
# 4chan Thread Archiver
# Download a fully functional offline copy of a 4chan thread
# Usage: ./archiver.sh [URL] [OutputDir] [Interval in s] [thumbnails/full 0/1] [only download once = s]
# Depends: bash, wget
# License: shove it up your arse for all I care

# ANSI colour codes used for the status/help output (NC resets).
GR='\033[38;5;82m' ; PN='\033[38;5;171m' ; BL='\033[38;5;27m' ; RD='\033[0;31m' ; PR='\033[0;35m' ; CY='\033[0;36m' ; NC='\033[0m'

# Help screen: shown when no URL was given, or -h/--help was requested.
if [[ ! $1 || "$1" == "-h" || "$1" == "--help" ]]; then
  echo -e "\n${GR}4chan Thread Archiver${NC}\nDownload a fully functional offline copy of a 4chan thread\n\nUsage: ./${GR}Archiver.sh${NC} [URL] ${BL}[OutputDir]${NC} ${PR}[update interval[s]]${NC} ${CY}[thumbnails/full 0/1]${NC} ${PN}[s for snapshot]${NC}\nDefaults: ./${GR}Archiver.sh${NC} [none] ${BL}[name of thread]${NC} ${PR}[120]${NC} ${CY}[1]${NC}\ninput nothing or d to set a variable to default\nentering s as the ${PN}5th argument${NC} will download the thread once\n\nExample: ./${GR}Archiver.sh${NC} https://boards.4channel.org/a/thread/197436238 ${BL}d${NC} ${PR}60${NC} ${CY}0${NC}\n${GR}saves thread${NC} 197436238 to the ${BL}default directory${NC} with an ${PR}update interval of 60 seconds${NC} and it will ${CY}only save thumbnails${NC}.\n"
  exit
fi
# Check for wget. `command -v` is the portable tool-presence test;
# the original ran `wget -h` (slower, and fails oddly on some builds)
# and its message interpolated an unset $sudo variable, silently
# dropping the word "sudo".
if ! command -v wget >/dev/null 2>&1; then
  echo "This script requires wget. sudo apt-get install wget or whatever equivalent of that on your outlandish distro." >&2
  exit 1
fi
# Custom directory from which the downloads will be run (edit to taste).
Path="/mnt/j/User/bilder/aImportant/Screencaps/a/Archived_threads/"
URL="$1"
OutputDir="$2"
Interval="$3"
Images="$4"
Snapshot="$5"
# Probe the thread once before doing anything else; an empty body
# means the URL is wrong or the thread is gone.
if [[ ! $(wget -q -O - "$URL") ]]; then
  echo -e "\n${RD}Thread not found${NC}\n"
  exit 1
else
  echo -e "\nThread found\n"
fi
# Replace empty or "d" arguments with default values.
# Default OutputDir: the page <title>, stripped of punctuation,
# non-printables and the trailing " - 4chan", spaces turned into _.
if [[ ! $2 || "$2" == "d" ]]; then
  OutputDir=$(wget -q -nv -O - "$URL" \
    | grep -oE "<title>.*</title>" \
    | sed -e 's/<title>//' -e 's/<\/title>//' \
          -e 's/[[:punct:]]//g' -e $'s/[^[:print:]\t]//g' \
          -e 's/ /_/g' -e 's/__4chan//')
fi
if [[ ! $3 || "$3" == "d" ]]; then
  Interval="120"
fi
# $4 empty / "d" / >= 1 -> mirror full-size images from every image host.
# $4 == 0              -> thumbnails only (…s.jpg); the extra
#                         --accept-regex rides along inside $Hosts and
#                         relies on word splitting at the wget call.
# The "d" test now comes before the arithmetic one so the literal
# string is never pushed through bash's numeric coercion.
if [[ ! $4 || "$4" == "d" || "$4" -ge 1 ]]; then
  Hosts="i.4cdn.org,is2.4chan.org,s.4cdn.org"
else
  Hosts="i.4cdn.org --accept-regex s.jpg$"
fi
# Confirm user input: show where things will land, then give the user
# a few seconds to ctrl+C out before anything touches the disk.
echo -e "current working directory: \n$Path"
echo -e "Thread ${GR}\"$(wget -q -nv -O - "$URL" | grep -oE "<title>.*</title>" | sed 's/<title>//' | sed 's/<\/title>//' | grep -oE "\- .* \-" | sed 's/\- //' | sed 's/\0x20 \-//')\"${NC} will be downloaded to ${BL}$OutputDir${NC}\n"
confirm=3
while [ "$confirm" -ge 0 ]; do
  # \033[0K\r clears to end of line and returns the cursor,
  # so the countdown overwrites itself in place.
  echo -ne "press ctrl+C to cancel $confirm\033[0K\r"
  sleep 1
  : $((confirm--))
done
echo -e "\n\npreparing download"
# If a custom save directory was specified, work from there.
# The cd is now checked: the original carried on in the wrong
# directory when it failed.
if [[ $Path ]]; then
  cd "$Path" || { echo "cannot cd to $Path" >&2; exit 1; }
fi
# Create the output folder (-p: no error if it already exists).
if [[ ! -d "$OutputDir" ]]; then
  mkdir -p "$OutputDir"
  echo "created folder $OutputDir"
fi
# Attempt to download stylesheets and js (everything except media).
# NOTE(review): the sentinel is pinned to one specific 4chan CSS
# version; if 4chan bumps "yotsubanew.692.css" this loop never
# terminates — TODO confirm against the live site.
while [[ ! -f "${OutputDir}/yotsubanew.692.css" ]]; do
  wget -P "$OutputDir" -nd -r -l 1 -H -D i.4cdn.org,is2.4chan.org,s.4cdn.org -R gif,webm,png,jpg -p -k -N -c --adjust-extension -q "$URL"
done
echo -e "downloaded stylesheets\n\ncommencing download of thread\n"
# Download one pass of the thread and repair file extensions in the
# saved html.
DownloadThread () {
  # wget: recursive one level deep (-r -l 1) across the allowed hosts
  # (-H -D), fetch page requisites (-p), rewrite links to local paths
  # (-k), only newer files (-N), resume partial downloads (-c),
  # no subdirectories (-nd), terse output (-nv).
  # $Hosts is INTENTIONALLY unquoted: in thumbnail mode it also carries
  # an --accept-regex option that must word-split into separate args.
  # shellcheck disable=SC2086
  wget -P "$OutputDir" -nd -nv -r -l 1 -H -D $Hosts --reject-regex robots.txt -p -k -N -c --adjust-extension "$URL"
  # Fix mismatched extensions inside the downloaded html files.
  # BUG FIX: the original piped `find` output through sed with no -i
  # and no file argument — it merely printed transformed *filenames*
  # and never edited any file. sed -i now rewrites the html in place.
  cd "$OutputDir" || return
  find . -type f -name '*.html' -exec sed -i \
    -e 's/txt\.tmp\.html/txt/g' -e 's/txt\.html/txt/g' \
    -e 's/jpg\.html/jpg/g' -e 's/png\.html/png/g' \
    -e 's/webm\.html/webm/g' -e 's/gif\.html/gif/g' {} +
  cd ..
}
# Snapshot mode: if the 5th argument was "s", download once and exit.
# Uses the $Snapshot variable assigned above instead of raw $5, which
# was assigned but never read.
if [[ "$Snapshot" == "s" ]]; then
  echo "saving snapshot"
  DownloadThread
  echo -e "\n${GR}Download completed.${NC}\n"
  exit
fi
# Re-download the thread at the chosen interval until the cycle after
# archived.gif was fetched (i.e. the thread has been archived).
while [[ ! -f "${OutputDir}/archived.gif" ]]; do
  # Stop cleanly if the thread 404s mid-run.
  if [[ ! $(wget -q -O - "$URL") ]]; then
    echo -e "\n${RD}Thread was 404'd. Probably mods being faggots. go throw them a complaint over at irc.rizon.net/4chan\nLiveArchiver stopped.${NC}\nUnfortunately, this might have broken some of the image links and I am unable to get sed to cooperate, so you'll have to open the html with a text editor and search and replace all instances and variations of https://i.4cdn.org/[BOARD]/ for the following domains:\n i.4cdn.org\n is2.4chan.org\n s.4cdn.org\nwith blank for the archived html to properly display images.\nI apologise for the inconvinience."
    exit 1
  fi
  DownloadThread
  echo ""
  # Visible in-place countdown until the next cycle.
  cntdwn="$Interval"
  while [ "$cntdwn" -ge 0 ]; do
    echo -ne "Cycle completed, sleeping for $cntdwn seconds\033[0K\r"
    sleep 1
    : $((cntdwn--))
  done
  echo ""
done
echo -e "\n${GR}Thread was Archived.\nDownload has been completed. LiveArchiver stopped.${NC}\n"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement