Advertisement
Guest User

Untitled

a guest
Sep 26th, 2021
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 3.28 KB | None | 0 0
  1. #!/usr/bin/env bash
  2.  
  3. output_dir=rm_booru_org_dump
  4.  
  5. mkdir -p $output_dir
  6.  
  7. # get total post count
  8. total_post_count=$(curl 'https://rm.booru.org' \
  9.     -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0' \
  10.     -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
  11.     -H 'Accept-Language: en-US,en;q=0.5' \
  12.     -H 'Connection: keep-alive' \
  13.     -H 'Upgrade-Insecure-Requests: 1' \
  14.     -H 'Sec-Fetch-Dest: document' \
  15.     -H 'Sec-Fetch-Mode: navigate' \
  16.     -H 'Sec-Fetch-Site: none' \
  17.     -H 'Sec-Fetch-User: ?1' \
  18.     --compressed \
  19.     --no-progress-meter \
  20.     | grep -oE 'Serving .* posts' | grep -oE '[0-9]*,[0-9]*' | tr -d ',')
  21.  
  22. echo -e "Total post count = $total_post_count"
  23.  
  24. # iterate through posts
  25. for post_num in `seq 1 $total_post_count` ; do
  26.     echo -e "Post $post_num / $total_post_count"
  27.  
  28.     # try to download post page
  29.     curl "https://rm.booru.org/index.php?page=post&s=view&id=$post_num" \
  30.     -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0' \
  31.     -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
  32.     -H 'Accept-Language: en-US,en;q=0.5' \
  33.     -H 'Connection: keep-alive' \
  34.     -H 'Upgrade-Insecure-Requests: 1' \
  35.     -H 'Sec-Fetch-Dest: document' \
  36.     -H 'Sec-Fetch-Mode: navigate' \
  37.     -H 'Sec-Fetch-Site: none' \
  38.     -H 'Sec-Fetch-User: ?1' \
  39.     --no-progress-meter \
  40.     --compressed > /tmp/rmbooru_post_page.html
  41.    
  42.     # post is not deleted
  43.     if [ -f /tmp/rmbooru_post_page.html ] ; then
  44.         if [ -s /tmp/rmbooru_post_page.html ] ; then
  45.  
  46.             # parse image link
  47.             img_link=$(cat /tmp/rmbooru_post_page.html | grep -oE 'https://img\.booru\.org\/rm.*\" ' | awk '{print $1}' | tr -d '"')
  48.  
  49.             # form paths for image and tags
  50.             img_name=$(basename $img_link)
  51.             img_path="$output_dir"/"$img_name"
  52.             img_tags_name=$img_name"_tags.txt"
  53.             img_tags_path="$output_dir"/$img_tags_name
  54.  
  55.             # parse and replace tags
  56.             cat /tmp/rmbooru_post_page.html | grep -F '<title>Rozen Maiden' | sed 's/\t<title>Rozen Maiden - //' | sed 's/<\/title>//' | tr ' ' '\n' > "$img_tags_path"
  57.        
  58.             # check if image is downloaded
  59.             if [ -f "$img_path" ] ; then
  60.                 echo -e "Image is already downloaded"
  61.             else
  62.                 echo -e "Downloading image"
  63.                 # download image
  64.                 curl "$img_link" \
  65.                      -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0' \
  66.                      -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
  67.                      -H 'Accept-Language: en-US,en;q=0.5' \
  68.                      -H 'Connection: keep-alive' \
  69.                      -H 'Upgrade-Insecure-Requests: 1' \
  70.                      -H 'Sec-Fetch-Dest: document' \
  71.                      -H 'Sec-Fetch-Mode: navigate' \
  72.                      -H 'Sec-Fetch-Site: none' \
  73.                      -H 'Sec-Fetch-User: ?1' \
  74.                      --no-progress-meter \
  75.                      --compressed > "$img_path"
  76.             fi
  77.         fi
  78.     fi
  79.     rm /tmp/rmbooru_post_page.html
  80.  
  81. done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement