Advertisement
Guest User

Untitled

a guest
Sep 28th, 2021
84
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 3.37 KB | None | 0 0
  1. #!/usr/bin/env bash
  2.  
  3. image_dir="rm_booru_org_dump/images"
  4. tags_dir="rm_booru_org_dump/tags"
  5.  
  6. mkdir -p "$image_dir"
  7. mkdir -p "$tags_dir"
  8.  
  9. # get total post count
  10. total_post_count=$(curl 'https://rm.booru.org' \
  11.     -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0' \
  12.     -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
  13.     -H 'Accept-Language: en-US,en;q=0.5' \
  14.     -H 'Connection: keep-alive' \
  15.     -H 'Upgrade-Insecure-Requests: 1' \
  16.     -H 'Sec-Fetch-Dest: document' \
  17.     -H 'Sec-Fetch-Mode: navigate' \
  18.     -H 'Sec-Fetch-Site: none' \
  19.     -H 'Sec-Fetch-User: ?1' \
  20.     --compressed \
  21.     --no-progress-meter \
  22.     | grep -oE 'Serving .* posts' | grep -oE '[0-9]*,[0-9]*' | tr -d ',')
  23.  
  24. echo -e "Total post count = $total_post_count"
  25.  
  26. # iterate through posts
  27. for post_num in `seq 1 $total_post_count` ; do
  28.     echo -e "Post $post_num / $total_post_count"
  29.  
  30.     # try to download post page
  31.     curl "https://rm.booru.org/index.php?page=post&s=view&id=$post_num" \
  32.     -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0' \
  33.     -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
  34.     -H 'Accept-Language: en-US,en;q=0.5' \
  35.     -H 'Connection: keep-alive' \
  36.     -H 'Upgrade-Insecure-Requests: 1' \
  37.     -H 'Sec-Fetch-Dest: document' \
  38.     -H 'Sec-Fetch-Mode: navigate' \
  39.     -H 'Sec-Fetch-Site: none' \
  40.     -H 'Sec-Fetch-User: ?1' \
  41.     --no-progress-meter \
  42.     --compressed > /tmp/rmbooru_post_page.html
  43.    
  44.     # post is not deleted
  45.     if [ -f /tmp/rmbooru_post_page.html ] ; then
  46.         if [ -s /tmp/rmbooru_post_page.html ] ; then
  47.  
  48.             # parse image link
  49.             img_link=$(cat /tmp/rmbooru_post_page.html | grep -oE 'https://img\.booru\.org\/rm.*\" ' | awk '{print $1}' | tr -d '"')
  50.  
  51.             # form paths for image and tags
  52.             img_name=$(basename $img_link)
  53.             img_path="$image_dir"/"$img_name"
  54.             img_tags_name=$img_name"_tags.txt"
  55.             img_tags_path="$tags_dir"/$img_tags_name
  56.  
  57.             # parse and replace tags
  58.             cat /tmp/rmbooru_post_page.html | tr ' ' '\n' | grep -oE 'tags=.*\"' | tr -d '"' | tr -d "\'" | grep -vF 'tags=all' | grep -vF 'tags=+my_tags' | sed 's/tags=//g' > "$img_tags_path"
  59.        
  60.             # check if image is downloaded
  61.             if [ -f "$img_path" ] ; then
  62.                 echo -e "Image is already downloaded"
  63.             else
  64.                 echo -e "Downloading image"
  65.                 # download image
  66.                 curl "$img_link" \
  67.                      -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0' \
  68.                      -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
  69.                      -H 'Accept-Language: en-US,en;q=0.5' \
  70.                      -H 'Connection: keep-alive' \
  71.                      -H 'Upgrade-Insecure-Requests: 1' \
  72.                      -H 'Sec-Fetch-Dest: document' \
  73.                      -H 'Sec-Fetch-Mode: navigate' \
  74.                      -H 'Sec-Fetch-Site: none' \
  75.                      -H 'Sec-Fetch-User: ?1' \
  76.                      --no-progress-meter \
  77.                      --compressed > "$img_path"
  78.             fi
  79.         fi
  80.     fi
  81.     rm /tmp/rmbooru_post_page.html
  82.  
  83. done
  84.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement