Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env bash
#
# Dump rm.booru.org: for every post id from 1 up to the post count shown on
# the front page, save the post's image to rm_booru_org_dump/images/ and its
# tags (one per line) to rm_booru_org_dump/tags/<image name>_tags.txt.
#
# Usage: run without arguments from the directory that should hold the dump.
# Requires: curl (with --no-progress-meter support), grep, sed, tr, mktemp.
#
# NOTE(review): ids 1..<post count> are iterated; if post ids are not
# contiguous (e.g. deleted posts leave gaps), ids above the count would be
# missed -- confirm against the site.

set -u

readonly base_url='https://rm.booru.org'
readonly image_dir='rm_booru_org_dump/images'
readonly tags_dir='rm_booru_org_dump/tags'

# Options shared by every request: browser-like headers, transparent
# decompression, and no progress meter.
readonly -a curl_opts=(
  -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0'
  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
  -H 'Accept-Language: en-US,en;q=0.5'
  -H 'Connection: keep-alive'
  -H 'Upgrade-Insecure-Requests: 1'
  -H 'Sec-Fetch-Dest: document'
  -H 'Sec-Fetch-Mode: navigate'
  -H 'Sec-Fetch-Site: none'
  -H 'Sec-Fetch-User: ?1'
  --compressed
  --no-progress-meter
)

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

mkdir -p -- "$image_dir" "$tags_dir" || die "Cannot create output directories"

# Scratch file for the current post page; unique name instead of a fixed,
# predictable path in /tmp.
page_file=$(mktemp) || die "mktemp failed"
# Path of the image currently being downloaded (empty when none).
part_file=''

# Remove the scratch page and any half-downloaded image.
cleanup() {
  rm -f -- "$page_file"
  if [[ -n "$part_file" ]]; then
    rm -f -- "$part_file"
  fi
}
trap cleanup EXIT

# get total post count
# The number is taken from the "Serving N posts" text; thousands separators
# are optional so counts below 1,000 and above 999,999 parse as well.
total_post_count=$(
  curl "$base_url" "${curl_opts[@]}" \
    | grep -oE 'Serving .* posts' \
    | grep -oE '[0-9][0-9,]*' \
    | head -n 1 \
    | tr -d ','
)
[[ "$total_post_count" =~ ^[0-9]+$ ]] \
  || die "Could not determine total post count"
# Force base 10 so a value with leading zeros is not read as octal.
total_post_count=$((10#$total_post_count))
printf 'Total post count = %s\n' "$total_post_count"

# iterate through posts
for ((post_num = 1; post_num <= total_post_count; post_num++)); do
  printf 'Post %s / %s\n' "$post_num" "$total_post_count"

  # try to download post page
  # Shell redirection (not curl -o) so the scratch file is truncated even if
  # curl fails before writing, and never holds the previous post's page.
  curl "${base_url}/index.php?page=post&s=view&id=${post_num}" \
    "${curl_opts[@]}" > "$page_file"

  # An empty response is treated as a deleted/missing post.
  if [[ ! -s "$page_file" ]]; then
    continue
  fi

  # parse image link: first img.booru.org/rm URL, up to the closing quote
  img_link=$(
    grep -oE 'https://img\.booru\.org/rm[^"[:space:]]*' "$page_file" \
      | head -n 1
  )
  if [[ -z "$img_link" ]]; then
    printf 'No image link found for post %s, skipping\n' "$post_num" >&2
    continue
  fi

  # form paths for image and tags
  img_name=${img_link##*/}
  if [[ -z "$img_name" ]]; then
    printf 'Cannot derive a file name from %s, skipping\n' "$img_link" >&2
    continue
  fi
  img_path="${image_dir}/${img_name}"
  img_tags_path="${tags_dir}/${img_name}_tags.txt"

  # parse and replace tags
  # Split the page on spaces, keep the tags=... fragments, strip quotes,
  # drop the tags=all and tags=+my_tags entries, then remove the "tags="
  # prefix.
  tr ' ' '\n' < "$page_file" \
    | grep -oE 'tags=.*"' \
    | tr -d "\"'" \
    | grep -vF -e 'tags=all' -e 'tags=+my_tags' \
    | sed 's/tags=//g' > "$img_tags_path"

  # check if image is downloaded
  if [[ -f "$img_path" ]]; then
    printf 'Image is already downloaded\n'
  else
    printf 'Downloading image\n'
    # download image
    # Fetch into a .part file and rename only on success, so a failed
    # transfer or HTTP error never leaves a file that a later run would
    # report as already downloaded.
    part_file="${img_path}.part"
    if curl --fail "$img_link" "${curl_opts[@]}" -o "$part_file"; then
      mv -f -- "$part_file" "$img_path"
    else
      printf 'Failed to download %s\n' "$img_link" >&2
      rm -f -- "$part_file"
    fi
    part_file=''
  fi
done
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement