Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# Fix the links in the HTML files of a BetaArchive forum siterip (done with
# wget) so that the saved thread pages link to each other locally.
#
# Every file named "viewtopic.php...start=N.html" that sits directly inside
# ~/betaarchive/www.betaarchive.com/forum is read, its links are rewritten,
# and the result is saved next to it as "pageN.html".  The original file is
# removed only after the new one has been written.  Other *.html files are
# skipped with a warning.  Each rewritten line is also printed to stdout.
#
# This is the command I used to make a local mirror of a forum thread:
# page=0; for n in {1..45}; do wget -EHkp -e robots=off --waitretry=10 --tries=10 --user-agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' "www.betaarchive.com/forum/viewtopic.php?f=61&t=33250&start=${page}"; (( page = page + 25 )); done
#
# Requires bash >= 4 (mapfile) and GNU sed (-E and the I flag of s///).

dir="${HOME}/betaarchive"
forum_dir="${dir}/www.betaarchive.com/forum"

# Print a message to stderr and abort with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# rewrite PATTERN REPLACEMENT [FLAGS]
# If the global $cur matches PATTERN (an ERE), replace the matches in $cur
# with REPLACEMENT (sed replacement syntax) and return 0; otherwise leave $cur
# alone and return 1.  FLAGS are the sed s/// flags (default: g).  Testing in
# bash first avoids forking sed for the vast majority of lines.
# '@' is the sed delimiter because none of the patterns below contain it.
rewrite() {
  local out
  [[ $cur =~ $1 ]] || return 1
  out=$(sed -E "s@${1}@${2}@${3:-g}" <<<"$cur") || die "sed failed on: ${cur}"
  cur=$out
}

# Name wget gives to a saved thread page; group 1 is the page's start offset.
name_re='^viewtopic\.php.*start=([0-9]+)\.html$'

# The patterns are bounded by the quoted attribute value so that several links
# on one line are each rewritten on their own.  Query separators are matched
# as "&amp;", which is how "&" is written inside an HTML attribute.

# Image attached to a post, linked by absolute URL.
image_re='(<a href=")https?://www\.betaarchive\.com(/[^"]*\.(png|jpe?g|gif)" class="postlink")'
image_to='\1..\2'

# Absolute link to a thread page on the forum.
abs_re='href="https?://www\.betaarchive\.com/forum/viewtopic\.php'
abs_to='href="viewtopic.php'

# "...&sid=X&start=N" link; group 2 is the start offset.
sid_start_re='"viewtopic\.php(\?f=[0-9]+&amp;t=[0-9]+&amp;sid=[[:alnum:]]+&amp;start=([0-9]+))"'
sid_start_to='"page\2.html\1"'

# Link to a single post ("?p=N&sid=X#pN"); rewritten to the current page.
post_re='href="viewtopic\.php(\?p=[0-9]+&amp;sid=[[:alnum:]]+#p[0-9]+)"'

# "...&start=N&sid=X" link, e.g. the thread title; group 2 is the start offset.
start_sid_re='"viewtopic\.php(\?f=[0-9]+&amp;t=[0-9]+&amp;start=([0-9]+)&amp;sid=[[:alnum:]]+)"'
start_sid_to='"page\2.html\1"'

# Link that wget already converted to a local file name.
local_re='href="viewtopic\.php%3Ff=[0-9]+&amp;t=[0-9]+&amp;start=([0-9]+)\.html(#[a-z]*)"'
local_to='href="page\1.html\2"'

# Link to the thread without a start offset, i.e. its first page.
first_re='"viewtopic\.php(\?f=[0-9]+&amp;t=[0-9]+&amp;sid=[[:alnum:]]+)"'
first_to='"page0.html\1"'

[[ -d "$dir" ]] || die "directory not found: ${dir}"
cd "$dir" || die "cannot cd to ${dir}"

mapfile -t files < <(find "$forum_dir" -maxdepth 1 -type f -iname "*.html" | sort)

for f in "${files[@]}"; do
  name="${f##*/}"

  # Without a start offset there is no pageN.html name to derive; this also
  # makes re-runs skip pages that were already converted.
  if [[ ! $name =~ $name_re ]]; then
    printf 'skipping (not a saved thread page): %s\n' "$f" >&2
    continue
  fi

  num="${BASH_REMATCH[1]}"
  new_f="${f%/*}/page${num}.html"

  # Refuse to clobber an existing page, and do so before anything is deleted.
  [[ ! -e "$new_f" ]] || die "refusing to overwrite existing file: ${new_f}"

  mapfile -t html <"$f" || die "cannot read ${f}"

  for j in "${!html[@]}"; do
    # All rewrites accumulate in $cur so that no step discards an earlier one.
    cur="${html[j]}"

    # The I flag keeps the substitution case-insensitive, as it always was.
    if rewrite "$image_re" "$image_to" gI; then
      printf '%s\n' "$cur"
    fi

    # These three link forms are only rewritten on lines that had an absolute
    # forum link to begin with.
    if rewrite "$abs_re" "$abs_to"; then
      rewrite "$sid_start_re" "$sid_start_to"
      rewrite "$post_re" "href=\"page${num}.html\\1\""
      rewrite "$start_sid_re" "$start_sid_to"
      printf '%s\n' "$cur"
    fi

    if rewrite "$local_re" "$local_to"; then
      printf '%s\n' "$cur"
    fi

    if rewrite "$first_re" "$first_to"; then
      printf '%s\n' "$cur"
    fi

    html[j]="$cur"
  done

  # Write the new page completely before removing the original, so a failure
  # at any point never loses the source file.
  : >"$new_f" || die "cannot create ${new_f}"
  if (( ${#html[@]} > 0 )); then
    printf '%s\n' "${html[@]}" >>"$new_f" || die "cannot write ${new_f}"
  fi
  rm -- "$f" || die "cannot remove ${f}"
done
Add Comment
Please, Sign In to add comment