#!/usr/bin/env bash
#
# Dump the images and tags of https://rm.booru.org into $output_dir.
#
# The front page is fetched to read the advertised post count N, then the
# post pages with ids 1..N are visited in order. For every post page that
# comes back non-empty, the image is saved as "$output_dir/<image name>" and
# its tags (taken from the page <title>) are written one per line to
# "$output_dir/<image name>_tags.txt".
#
# NOTE(review): iterating ids 1..N assumes post ids never exceed the post
# count; if ids of deleted posts are not reused, the newest posts may be
# missed -- confirm against the site.
#
# Requires: curl (with --no-progress-meter support), grep, sed, tr, mktemp.

set -uo pipefail

output_dir=rm_booru_org_dump
base_url='https://rm.booru.org'

# Options shared by every request. --fail makes curl return non-zero and
# write nothing on HTTP errors, so error pages are never stored as content.
curl_opts=(
  --compressed
  --no-progress-meter
  --fail
  -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0'
  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
  -H 'Accept-Language: en-US,en;q=0.5'
  -H 'Connection: keep-alive'
  -H 'Upgrade-Insecure-Requests: 1'
  -H 'Sec-Fetch-Dest: document'
  -H 'Sec-Fetch-Mode: navigate'
  -H 'Sec-Fetch-Site: none'
  -H 'Sec-Fetch-User: ?1'
)

# Print a diagnostic to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Print a diagnostic to stderr and abort the script.
die() {
  warn "$@"
  exit 1
}

# Fetch URL $1 and write the response body to stdout.
fetch() {
  curl "${curl_opts[@]}" "$1"
}

# Print the post count advertised on the front page ("Serving 12,345 posts")
# as a plain integer. Accepts any number of thousands separators, including
# none, and only uses the first number found.
get_total_post_count() {
  fetch "$base_url" \
    | grep -oE 'Serving .* posts' \
    | grep -oE '[0-9][0-9,]*' \
    | head -n 1 \
    | tr -d ','
}

# Download the image and tags of post id $1.
# Globals:   output_dir, base_url, post_page (read)
# Returns:   0 always; problems with a single post are reported on stderr
#            and the post is skipped so the dump can continue.
process_post() {
  local post_num=$1
  local img_link img_name img_path img_tags_path

  if ! fetch "$base_url/index.php?page=post&s=view&id=$post_num" > "$post_page"; then
    warn "Post $post_num: failed to fetch post page, skipping"
    return 0
  fi

  # An empty response is treated as "no such post" (e.g. deleted).
  [[ -s "$post_page" ]] || return 0

  # First image URL on the page, cut at the closing quote / whitespace.
  img_link=$(grep -oE 'https://img\.booru\.org/rm[^"[:space:]]*' "$post_page" | head -n 1)
  if [[ -z "$img_link" ]]; then
    warn "Post $post_num: no image link found, skipping"
    return 0
  fi

  img_name=${img_link##*/}
  if [[ -z "$img_name" ]]; then
    warn "Post $post_num: cannot derive a file name from $img_link, skipping"
    return 0
  fi
  img_path="$output_dir/$img_name"
  img_tags_path="$output_dir/${img_name}_tags.txt"

  # The page title is "Rozen Maiden - <space separated tags>"; strip the
  # wrapper and store one tag per line.
  grep -F '<title>Rozen Maiden' "$post_page" \
    | sed -e 's/[[:space:]]*<title>Rozen Maiden - //' -e 's/<\/title>//' \
    | tr ' ' '\n' > "$img_tags_path"

  # A zero-length file is a leftover of a failed download, not a real image.
  if [[ -s "$img_path" ]]; then
    printf '%s\n' "Image is already downloaded"
    return 0
  fi

  printf '%s\n' "Downloading image"
  # Download to a side file and rename on success so that an interrupted or
  # failed transfer is never mistaken for a finished image on the next run.
  if fetch "$img_link" > "$img_path.part"; then
    mv -f -- "$img_path.part" "$img_path"
  else
    rm -f -- "$img_path.part"
    warn "Post $post_num: failed to download $img_link"
  fi
  return 0
}

main() {
  local post_num

  mkdir -p -- "$output_dir" || die "Cannot create output directory $output_dir"

  # Scratch file for the current post page; removed on any exit path.
  post_page=$(mktemp) || die "mktemp failed"
  trap 'rm -f -- "$post_page"' EXIT

  total_post_count=$(get_total_post_count)
  [[ "$total_post_count" =~ ^[0-9]+$ ]] \
    || die "Could not determine total post count from $base_url"
  printf 'Total post count = %s\n' "$total_post_count"

  for ((post_num = 1; post_num <= total_post_count; post_num++)); do
    printf 'Post %s / %s\n' "$post_num" "$total_post_count"
    process_post "$post_num"
  done
}

main "$@"