fant0men

Parse vetusware.com page ranges

Aug 20th, 2020
63
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  #!/bin/bash
  # Temporary script to parse a range of listing pages from vetusware.com.
  #
  # For every entry found on the pages it extracts the name, the optional
  # description and the download link. Entries whose name or description/link
  # line mentions one of the keywords "source", "dos", "os/2" or "windows"
  # (case-insensitive) are appended to the result file.
  #
  # Output: ${HOME}/vetusware_links-sorted-<random>.txt
  #   One entry per paragraph:
  #     name: <name>
  #     desc: <description> link: <url>     (or just "link: <url>")
  #     <blank line>
  #
  # Requires: bash 4+ (mapfile), curl, grep.

  set -uo pipefail

  clear

  out_f_sort="${HOME}/vetusware_links-sorted-${RANDOM}.txt"

  # Range of listing pages to fetch (inclusive).
  first_page=1
  last_page=30

  # An entry occupies three consecutive lines of a page: a line holding the
  # item type and language, then the line with the name, then the line with
  # the (optional) description and the download link.
  regex_type='<p class="item_type"><a href=".*">.*</a>.*<a href=".*">.*</a></p>.*<p class="item_lang">.*</p>'
  regex_name='^.*<h3>.*<strong>(.*)</strong>.*</h3>.*$'
  regex_desc='^.*<p class="item_description">(.*)</p>.*<p><a href="(https://vetusware.com/download/.*)">download</a></p>.*$'
  regex_link='^.*<p><a href="(https://vetusware.com/download/.*)">download</a></p>.*$'

  # Print a message to stderr and abort.
  die() {
    printf '%s\n' "$*" >&2
    exit 1
  }

  #######################################
  # Extract the interesting entries from one listing page.
  # Globals:   regex_type, regex_name, regex_desc, regex_link (read)
  # Arguments: none; the page HTML is read from stdin
  # Outputs:   for each kept entry, the "name:" line, the "desc:/link:" line
  #            and an empty line on stdout
  #######################################
  parse_page() {
    local -a lines=()
    local i total name desc

    # Read the page verbatim. Unlike a plain 'read' loop this neither joins
    # lines that end in a backslash nor drops a last line without a newline,
    # so array indexes always correspond to real page lines.
    mapfile -t lines
    total=${#lines[@]}

    # Only lines that still have two more lines after them can start an entry.
    for (( i = 0; i + 2 < total; i++ )); do
      [[ ${lines[i]} =~ $regex_type ]] || continue

      # Entries whose name cannot be extracted are dropped.
      [[ ${lines[i+1]} =~ $regex_name ]] || continue
      name="name: ${BASH_REMATCH[1]}"

      # Prefer description + link; fall back to the link only. If neither
      # pattern matches, the line is kept untouched.
      desc=${lines[i+2]}
      if [[ $desc =~ $regex_desc ]]; then
        desc="desc: ${BASH_REMATCH[1]} link: ${BASH_REMATCH[2]}"
      elif [[ $desc =~ $regex_link ]]; then
        desc="link: ${BASH_REMATCH[1]}"
      fi

      # Keep the entry only if one of its two lines mentions a keyword.
      if grep -iqF -e source -e dos -e 'os/2' -e windows \
        <<<"${name}"$'\n'"${desc}"; then
        printf '%s\n%s\n\n' "$name" "$desc"
      fi
    done
  }

  # Make sure the result file exists (and is writable) before downloading.
  touch -- "$out_f_sort" || die "cannot create ${out_f_sort}"

  for (( n = first_page; n <= last_page; n++ )); do
    url="https://vetusware.com/category/OS/?cat=1&page=${n}"

    # The page is kept in memory, so no temporary files are needed. A failed
    # transfer aborts the run instead of being parsed as an empty page.
    page=$(curl -sS --connect-timeout 15 "$url") \
      || die "download failed: ${url}"

    parse_page <<<"$page" >>"$out_f_sort"
  done

  # The file name contains a random number, so tell the user where it is.
  printf 'Results written to %s\n' "$out_f_sort" >&2
RAW Paste Data