fant0men

Parsing HTML (modifying a siterip)

Aug 14th, 2020 (edited)
97
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2. # This script is meant to fix the links in the HTML files of my BetaArchive
  3. # forum siterip (done with wget).
  4.  
  5. # This is the command I used to make a local mirror of a forum thread:
  6.  
  7. # page=0; for n in {1..45}; do wget -EHkp -e robots=off --waitretry=10 --tries=10 --user-agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' "www.betaarchive.com/forum/viewtopic.php?f=61&t=33250&start=${page}"; (( page = page + 25 )); done
  8.  
  9. dir="${HOME}/betaarchive"
  10.  
  11. if [[ ! -d "$dir" ]]; then
  12.     exit
  13. fi
  14.  
  15. cd "$dir"
  16.  
  17. mapfile -t files < <(find "${dir}/www.betaarchive.com/forum" -maxdepth 1 -type f -iname "*.html" | sort)
  18.  
  19. image_regex='<a href=\"https{0,1}://www\.betaarchive\.com/.*\.(png|jpe{0,1}g|gif)\" class=\"postlink\"'
  20. page_regex='href=\"https{0,1}://www\.betaarchive\.com/forum/viewtopic\.php'
  21. page_regex2='\?f=[0-9]+\&amp\;t=[0-9]+\&amp\;sid=[[:alnum:]]+\&amp\;start=[0-9]+\"'
  22. page_regex3='\"\.\&\#x2F\;viewtopic\.php\&\#x3F\;f\&\#x3D\;[0-9]+\&amp\;amp\&\#x3B\;t\&\#x3D\;[0-9]+\&amp\;amp\&\#x3B\;sid\&\#x3D\;[[:alnum:]]+\"'
  23. page_regex4='href=\"viewtopic\.php\%3Ff=[0-9]+\&amp\;t=[0-9]+\&amp\;start=[0-9]+\.html\#([a-z]+){0,1}\"'
  24. page_regex5='\"viewtopic\.php\?f=[0-9]+\&amp\;t=[0-9]+\&amp\;sid=[[:alnum:]]+\"'
  25. page_regex6='href=\"viewtopic\.php\?p=[0-9]+\&amp\;sid=[[:alnum:]]+\#p[0-9]+\"'
  26. page_regex7='\"viewtopic\.php\?f=[0-9]+\&amp\;t=[0-9]+\&amp\;start=[0-9]+\&amp\;sid=[[:alnum:]]+\"'
  27. #<a href="viewtopic.php?f=61&amp;t=33250&amp;start=25&amp;sid=3e8af540222e2e1ce3a623dc88c577c5">Windows NT 4 Source Compiled! version 2</a>
  28.  
  29. for (( i = 0; i < ${#files[@]}; i++ )); do
  30.     f="${files[${i}]}"
  31.     f_dn=$(dirname "$f")
  32.  
  33.     num=$(sed -E "s|.*viewtopic\.php.*start=([0-9]+).html$|\1|" <<<"$f")
  34.     new_f=$(sed -E "s|viewtopic\.php.*start=([0-9]+).html$|page\1\.html|" <<<"$f")
  35.  
  36.     mapfile -t html <"$f"
  37.  
  38.     for (( j = 0; j < ${#html[@]}; j++ )); do
  39.         line="${html[${j}]}"
  40.  
  41.         if [[ $line =~ $image_regex ]]; then
  42.             get_html=$(sed -E 's/^.*(https{0,1}:\/\/www\.betaarchive\.com\/.*(\.png|jpe{0,1}g|gif))\" class=\"postlink\".*$/\1/I' <<<"$line")
  43.             html[${j}]=$(sed -E 's/https{0,1}:\/\/www\.betaarchive\.com(\/.*(\.png|jpe{0,1}g|gif))/\.\.\1/I' <<<"$line")
  44. #           wget -x -e robots=off --waitretry=10 --tries=10 --user-agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36' "$get_html"
  45.             echo "${html[${j}]}"
  46.         fi
  47.  
  48.         if [[ $line =~ $page_regex ]]; then
  49.             html[${j}]=$(sed -E 's|(.*)https{0,1}:\/\/www\.betaarchive\.com\/forum\/(.*)|\1\2|' <<<"$line")
  50.  
  51.             if [[ $line =~ $page_regex2 ]]; then
  52.                 page_tmp=$(sed -E "s|.*viewtopic\.php\?f=[0-9]+\&.*t=[0-9]+\&.*start=([0-9]+).*|\1|" <<<"${html[${j}]}")
  53.                 html[${j}]=$(sed -E "s|viewtopic\.php(\?f=[0-9]+\&.*t=[0-9]+\&.*start=[0-9]+)|page${page_tmp}\.html\1|" <<<"${html[${j}]}")
  54.             fi
  55.  
  56.             if [[ ${html[${j}]} =~ $page_regex6 ]]; then
  57.                 html[${j}]=$(sed -E "s|\"viewtopic\.php(\?p=[0-9]+\&amp\;sid=[[:alnum:]]+\#p[0-9]+)\"|\"page${num}\.html\1\"|" <<<"${html[${j}]}")
  58.             fi
  59.  
  60.             if [[ ${html[${j}]} =~ $page_regex7 ]]; then
  61.                 html[${j}]=$(sed -E "s|\"viewtopic\.php(\?f=[0-9]+\&amp\;t=[0-9]+\&amp\;start=[0-9]+\&amp\;sid=[[:alnum:]]+)\"|\"page${num}\.html\1\"|" <<<"${html[${j}]}")
  62.             fi
  63.  
  64.             echo "${html[${j}]}"
  65.         fi
  66.  
  67. #       if [[ $line =~ $page_regex3 ]]; then
  68. #           html[${j}]=$(sed -E "s|\.\&\#x2F\;viewtopic\.php(\&\#x3F\;f\&\#x3D\;[0-9]+\&amp\;amp\&\#x3B\;t\&\#x3D\;[0-9]+\&amp\;amp\&\#x3B\;sid\&\#x3D\;[[:alnum:]]+)|page${num}\.html\1|" <<<"${html[${j}]}")
  69. #           echo "${html[${j}]}"
  70. #       fi
  71.  
  72.         if [[ ${html[${j}]} =~ $page_regex4 ]]; then
  73.             html[${j}]=$(sed -E "s|viewtopic\.php\%3Ff=[0-9]+\&.*t=[0-9]+\&.*start=([0-9]+).html(\#([a-z]+){0,1})|page\1\.html\2|" <<<"${html[${j}]}")
  74.             echo "${html[${j}]}"
  75.         fi
  76.  
  77.         if [[ ${html[${j}]} =~ $page_regex5 ]]; then
  78.             html[${j}]=$(sed -E "s|\"viewtopic\.php(\?f=[0-9]+\&amp\;t=[0-9]+\&amp\;sid=[[:alnum:]]+)\"|\"page0\.html\1\"|" <<<"${html[${j}]}")
  79.             echo "${html[${j}]}"
  80.         fi
  81.     done
  82.  
  83.     rm "$f" || exit
  84.  
  85.     if [[ -f $new_f ]]; then
  86.         exit
  87.     fi
  88.  
  89.     touch "$new_f" || exit
  90.  
  91.     for (( j = 0; j < ${#html[@]}; j++ )); do
  92.         echo "${html[${j}]}" >> "$new_f"
  93.     done
  94. done
  95.  
  96.  
RAW Paste Data