fant0men

parse_movies.sh

Sep 20th, 2019 (edited)
104
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2. # Parse each line of a list of movie filenames, to get the
  3. # movie name and year.
  4. # Search for the movie on IMDb.
  5. # Parse the results, and add them to a list.
  6.  
  7. # TODO:
  8. # Add all the lines that couldn't be parsed or found on IMDb in a
  9. # separate text file.
  10.  
  # Path of the text file that lists the movie filenames, one per line.
  # Defaults to the original hard-coded location, but can be overridden by
  # passing a different path as the first argument to the script.
  list="${1:-${HOME}/find_movies-123.txt}"

  # Declares the $imdb associative array (hash), which stores every IMDb ID
  # that has already been processed, to prevent the same ID from being
  # processed (and printed) twice.
  declare -A imdb
  17.  
  # Percent-encodes a string so that it can be embedded in a URL query.
  # This is used by the 'imdb' function.
  #
  # Arguments: $@ - the text to encode (multiple arguments are joined with
  #                 single spaces).
  # Outputs:   the encoded text on stdout, without a trailing newline.
  #
  # The encoding is done byte by byte, so multi-byte (UTF-8) characters
  # are encoded correctly regardless of the current locale. Only the
  # RFC 3986 "unreserved" characters (A-Z a-z 0-9 - . _ ~) are passed
  # through unchanged; everything else, including the space character,
  # becomes %XX.
  #
  # The previous implementation asked curl to build a URL from an empty
  # string and then cut fixed numbers of characters off both ends of the
  # result, which depends on how curl treats an empty URL. This version
  # needs no curl process and always produces the same output.
  uriencode () {
      local IFS=$' \t\n'
      local text="$*"
      local byte

      if [[ -z $text ]]; then
          return 0
      fi

      # od prints each input byte as two lowercase hex digits, separated by
      # whitespace. The unquoted command substitution is deliberate: it
      # splits that output into one word per byte.
      for byte in $(printf '%s' "$text" | od -An -v -tx1); do
          case "$byte" in
              2d|2e|5f|7e|3[0-9]|4[1-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a])
                  # - . _ ~ 0-9 A-Z a-z: emit the character itself.
                  printf '%b' "\\x${byte}"
                  ;;
              *)
                  printf '%%%s' "${byte^^}"
                  ;;
          esac
      done
  }
  24.  
  # Breaks up a movie filename and extracts the movie name and year.
  #
  # Arguments: $1 - the filename (or any release-style name) to parse.
  # Outputs:   "Word Word ... (YYYY)" on stdout. If no year can be found,
  #            the year is printed as "(0000)".
  #
  # How the name is parsed:
  #   1. A leading "[tag]" (e.g. a release group) is removed. Only the
  #      first bracket pair is removed, so later brackets can't swallow
  #      the title.
  #   2. The year is the last run of four digits that is followed by a
  #      punctuation or whitespace character. Everything after it is
  #      discarded. If there is no such year, the first 64 characters of
  #      the name are used as the title.
  #   3. The word separator is whichever of dot, hyphen, underscore or
  #      space occurs most often (ties are resolved in that order), so
  #      that e.g. the hyphen in "Spider-Man.2002" is kept.
  #   4. The title is split on that separator, and the words are joined
  #      with single spaces.
  #
  # All variables are local, so calling this function does not overwrite
  # variables of the same name (such as $year) elsewhere in the script.
  break_name () {
      local bname="$1"
      local year_re='^.*[0-9]{4}([[:punct:]]|[[:space:]])'
      local tag_re='^[[:space:]]*\[[^]]*\][[:space:]]*'
      local tail_re='[[:space:]([{]$'
      local stem title year='0000' sep='.' candidate remainder word
      local -i count best=-1
      local -a words=() name=()

      # Step 1: drop a leading "[tag]".
      if [[ $bname =~ $tag_re ]]; then
          bname=${bname:${#BASH_REMATCH[0]}}
      fi

      # Step 2: the regex match is the longest possible prefix ending in
      # "<4 digits><punctuation or space>". Peel the last character and
      # the four digits off the end to get the year and the raw title.
      if [[ $bname =~ $year_re ]]; then
          stem=${BASH_REMATCH[0]}
          title=${stem%?}
          year=${title: -4}
          title=${title%????}
      else
          stem=${bname:0:64}
          title=$stem
      fi

      # Remove trailing whitespace, and an opening bracket left behind by
      # a year written as "(1999)", "[1999]" or "{1999}".
      while [[ $title =~ $tail_re ]]; do
          title=${title%?}
      done

      # Step 3: count each candidate separator. The count is taken over
      # the part that still includes the year, so that the separators
      # around the year take part in the vote.
      for candidate in '.' '-' '_' ' '; do
          remainder=${stem//"$candidate"/}
          count=$(( ${#stem} - ${#remainder} ))

          if (( count > best )); then
              best=$count
              sep=$candidate
          fi
      done

      # Step 4: split the title into words.
      IFS="$sep" read -r -a words <<<"$title"

      for word in "${words[@]}"; do
          # Trim leading and trailing whitespace, but keep whitespace
          # inside the word (e.g. "Die Hard" in "Die Hard.1988").
          word=${word#"${word%%[![:space:]]*}"}
          word=${word%"${word##*[![:space:]]}"}

          # Skip the empty words produced by repeated separators.
          if [[ -n $word ]]; then
              name+=("$word")
          fi
      done

      name+=("(${year})")

      # Echoes the complete parsed name, joined by single spaces.
      local IFS=' '
      printf '%s\n' "${name[*]}"
  }
  106.  
  # Searches IMDb for a movie, and prints one line describing the first
  # search hit: "Title (Year): (URL)".
  #
  # Arguments: $@ - the movie title, optionally followed by " (YYYY)",
  #                 e.g. "The Godfather (1972)". A year of "(0000)", or no
  #                 year at all, means the year is unknown, and the search
  #                 is then not restricted by release date.
  # Globals:   imdb (associative array, read and written) - the IMDb IDs
  #                 that have already been printed. An ID that is already
  #                 in the array is not fetched or printed again.
  # Calls:     uriencode (to make the title URL friendly), curl.
  # Outputs:   the result line on stdout. The usage text and the "no
  #            match" notice go to stderr, so stdout only ever contains
  #            results.
  # Returns:   1 if called without arguments.
  #
  # Based on the lookup script from:
  # https://www.krazyworks.com/imdb-lookup-script/
  #
  # History:
  # 2019-09-21: Modified to run inside a function, and output only basic
  #             information.
  # 2019-10-04: Modified to use curl instead of lynx.
  # 2019-10-06: A year of '0000' means the year is undefined; all movies
  #             with that title are searched, no matter the year.
  # 2019-10-06: The title regexes take into account that some titles on
  #             IMDb say "Movie (TV Movie 1999)", and the like.
  # 2019-10-06: The search includes these categories: Feature film,
  #             TV movie, TV special, Documentary, Video.
  # 2019-10-07: The title is encoded with the 'uriencode' function.
  #
  # Changes compared to the earlier version of this function:
  # - The usage error returns instead of exiting the whole script.
  # - The page is kept in a variable, so no temporary file is needed.
  # - All variables are local, so results from a previous call can't leak
  #   into the current one.
  # - The lookup through Google ($id2) is removed, since it was disabled.
  # - Nothing is fetched when the search gives no result, or when the ID
  #   has been processed already.
  imdb () {
  #   agent='Lynx/2.8.9rel.1 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/1.1.1d'
      local agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'

      # The types of IMDb search results to include.
      local types='feature,tv_movie,tv_special,documentary,video'

      local year_re='\(([0-9]{4})\)$'
      local suffix_re='^(.*) \([0-9]{4}\)$'
      local query y='' t search_url id url page full title year
      local -a curl_opts

      if [[ $# -eq 0 ]]; then
          echo 'Usage: imdb "Movie Title (Year)"' >&2
          return 1
      fi

      # Join all arguments with single spaces, whatever $IFS is set to.
      printf -v query '%s ' "$@"
      query=${query% }

      # $y is the year (without the parentheses), $t is the title with the
      # " (YYYY)" suffix removed.
      if [[ $query =~ $year_re ]]; then
          y=${BASH_REMATCH[1]}
      fi

      if [[ $query =~ $suffix_re ]]; then
          t=${BASH_REMATCH[1]}
      else
          t=$query
      fi

      t=$(uriencode "$t")

      curl_opts=(
          --location
          --user-agent "$agent"
          --retry 10
          --retry-delay 10
          --connect-timeout 10
          --silent
      )

      # Only restrict the search by release date if the year is known.
      search_url="https://www.imdb.com/search/title/?title=${t}&title_type=${types}"

      if [[ -n $y && $y != '0000' ]]; then
          search_url+="&release_date=${y},${y}"
      fi

      search_url+='&view=simple'

      # The ID of the first title link on the search results page.
      id=$(curl "${curl_opts[@]}" "$search_url" \
          | grep -Eo '<a href="/title/tt[0-9]{4,}/' \
          | grep -Eo 'tt[0-9]{4,}' \
          | head -n 1)

      if [[ -z $id ]]; then
          printf 'imdb: no match found for: %s\n' "$query" >&2
          return 0
      fi

      # Skip IDs that have been printed already.
      if [[ -n ${imdb[${id}]} ]]; then
          return 0
      fi

      imdb[${id}]=1

      url="https://www.imdb.com/title/${id}/"
      page=$(curl "${curl_opts[@]}" "$url")

      # The 'og:title' meta tag looks like "Title (Year) - IMDb", or
      # "Title (TV Movie Year) - IMDb".
      full=$(grep -Eo "<meta property='og:title' content=\".* \(.*[0-9]{4}\) - IMDb\"" <<<"$page" \
          | head -n 1 \
          | cut -d'"' -f2 \
          | sed 's/ - IMDb$//')

      title=$(sed -E 's/ \(.*[0-9]{4}\)$//' <<<"$full")
      year=$(grep -Eo "[0-9]{4}\)$" <<<"$full" | tr -d '[:punct:]')

  # Saving these because the regex:es might be useful in the future.
  # (They expect the page to be in a file called $tmpfile.)
  #   temp=$(grep "og:description" "${tmpfile}" | sed -e 's/content="/@/g' -e 's/" \/>/@/g' -e 's/\&quot;/\"/g' | cut -d'@' -f2)
  #   director=$(echo "${temp}" | cut -d'.' -f1 | sed 's/^Directed by //')
  #   cast=$(echo "${temp}" | cut -d'.' -f2 | sed 's/^ *//')
  #   plot=$(echo "${temp}" | cut -d'.' -f3 | sed 's/^ *//')
  #   rating=$(grep -i "ratingValue" "${tmpfile}" | head -n 1 | cut -d'"' -f4)

      printf "%s (%s): (%s)\n" "$title" "$year" "$url"
  }
  228.  
  # Goes through the list, one filename per line, parses each filename to
  # get the movie name and year, and looks the movie up on IMDb.
  if [[ ! -f $list || ! -r $list ]]; then
      printf 'Cannot read the movie list: %s\n' "$list" >&2
      exit 1
  fi

  # The list is read through file descriptor 3, so that no command inside
  # the loop can consume lines from it by reading standard input.
  # 'read -r' keeps backslashes in filenames intact, while the default $IFS
  # still trims leading and trailing whitespace from each line. The
  # '|| [[ -n $line ]]' part makes sure the last line is processed even if
  # the file doesn't end with a newline.
  while read -r line <&3 || [[ -n $line ]]; do
      # Skip empty lines.
      if [[ -z $line ]]; then
          continue
      fi

      name=$(break_name "$line")

      imdb "$name"
  done 3< "$list"
  234.  
RAW Paste Data