Guest User

скрипт для скачивания целых альбомов с musicmp3spb.ru

a guest
Jul 3rd, 2015
583
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.31 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import mechanize
  5. import re
  6. import urllib2
  7. import sys
  8. import os
  9.  
# Base URL of the site. download_album() rejects album URLs that do not start
# with it, and it is prepended to the site-relative covers-page and cover-image
# links.
top_url = "http://musicmp3spb.org"
  11.  
  12. def download_file(url, file_name):
  13.     u = urllib2.urlopen(url)
  14.     f = open(file_name, 'wb')
  15.     meta = u.info()
  16.     file_size = int(meta.getheaders("Content-Length")[0])
  17.     print "Downloading: %s Bytes: %s" % (file_name, file_size)
  18.  
  19.     file_size_dl = 0
  20.     block_sz = 8192
  21.     while True:
  22.         buffer = u.read(block_sz)
  23.         if not buffer:
  24.             break
  25.  
  26.         file_size_dl += len(buffer)
  27.         f.write(buffer)
  28.         status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
  29.         status = status + chr(8)*(len(status)+1)
  30.         print status,
  31.  
  32.     f.close()
  33.  
  34. def download_song(url, targetFolder):
  35.     print "Downloading song from %s" % url
  36.    
  37.     browser = get_browser_with(url)
  38.  
  39.     formcount=0
  40.     action_regexp = re.compile('/file/.*')
  41.     for form in browser.forms():
  42.         if action_regexp.match(form.attrs['action']):
  43.             formcount=formcount+1
  44.             break
  45.  
  46.     file_name = url + '.mp3'
  47.     for link in browser.links(url_regex='\/file\/'):
  48.         file_name = link.text
  49.  
  50.     browser.select_form(nr=formcount)
  51.     browser.submit()
  52.  
  53.     for link in browser.links(url_regex='tempfile\.ru'):
  54.         download_file(link.url, targetFolder + "/" + file_name)
  55.  
  56.  
  57. def download_mp3s(browser, targetFolder):
  58.     print "\nMP3"
  59.     title_regexp = re.compile('.*Скачать mp3.*', re.IGNORECASE)
  60.     for link in browser.links(url_regex='/download/.*'):
  61.         for title in [attr for attr in link.attrs if len(attr) == 2 and attr[0] == 'title']:
  62.             title_utf8 = title[1].decode('cp1251').encode('utf8')
  63.  
  64.             if title_regexp.match(title_utf8):
  65.                 url_arr = link.base_url.split('/')
  66.                 songUrl = link.url
  67.                 if not link.url.startswith('http'):
  68.                     if len(url_arr) > 3 and url_arr[0] == 'http:':
  69.                         page_url = 'http://' + url_arr[2]
  70.                     else:                    
  71.                         page_url = url_arr[0]
  72.                     songUrl = page_url + link.url
  73.                 print songUrl
  74.                 download_song(songUrl, targetFolder)
  75.  
  76. def download_covers(url, targetFolder):
  77.     print "\nCOVERS"
  78.     print "Downloading covers from %s" % url
  79.  
  80.     browser = get_browser_with(url)
  81.    
  82.     cover_links = list(browser.links(url_regex='\/images\/'))
  83.  
  84.     covers_target_folder = targetFolder
  85.     if len(cover_links) > 4:
  86.         covers_target_folder = targetFolder + "/" + "covers"
  87.         if not os.path.exists(covers_target_folder):
  88.             os.makedirs(covers_target_folder)
  89.         print "There are %d covers, therefore they will be downloaded into subfolder %s" % (len(cover_links), covers_target_folder)
  90.  
  91.     counter = 0
  92.     for link in cover_links:
  93.         counter = counter + 1
  94.         cover_url = top_url + link.url
  95.         print cover_url
  96.         file_name = cover_url.split("/")[-1]
  97.         file_extension = cover_url.split(".")[-1]
  98.         cover_number = "%s" % counter
  99.         if counter < 10:
  100.             cover_number = "0" + cover_number
  101.         cover_target_name = covers_target_folder + "/cover_" + cover_number + "." + file_extension
  102.         download_file(cover_url, cover_target_name)
  103.  
  104. def get_folder_name_for_album(browser):
  105.     album_and_artist = get_album_and_artist(browser.title())
  106.     if len(album_and_artist) <> 2:
  107.         print "Cannot format an album title automatically, sorry..."
  108.         folder_name = str.join(",", album_and_artist)
  109.         folder_name = remove_characters_illegal_in_paths(folder_name)
  110.         print "Following folder name will be used: " + folder_name
  111.         return folder_name
  112.     year = get_release_year(browser, album_and_artist[0])
  113.     folder_name = album_and_artist[1].strip() + " - " + year + " - " + album_and_artist[0].strip()
  114.     return remove_characters_illegal_in_paths(folder_name)
  115.  
  116. def remove_characters_illegal_in_paths(inputName):
  117.     result = inputName
  118.     for bad_character_in_path in '\\:*?"<>|/':
  119.         result = result.replace(bad_character_in_path, '_')
  120.     return result
  121.  
  122. def get_album_and_artist(webPageTitle):
  123.     noiseAtTheEnd = " mp3"
  124.     if webPageTitle.endswith(noiseAtTheEnd):
  125.         webPageTitle = webPageTitle[:-len(noiseAtTheEnd)]
  126.     splitTitle = webPageTitle.split(",")
  127.     return splitTitle
  128.  
  129. def get_release_year(browser, albumTitle):
  130.     htmlOfWebPage = browser.response().read()
  131.     htmlSnippetBeforeYear = "<div class=\"Name\">\n" + albumTitle + "<br />\n<i>"
  132.     positionOfAlbumTitleWithYear = htmlOfWebPage.find(htmlSnippetBeforeYear)
  133.     if positionOfAlbumTitleWithYear < 0:
  134.         return ""
  135.     yearStartPosition = positionOfAlbumTitleWithYear + len(htmlSnippetBeforeYear)
  136.     return "%s" % htmlOfWebPage[yearStartPosition:yearStartPosition+4]
  137.  
  138. def get_covers_url(browser):
  139.     covers_regex =re.compile('.*обложки.*', re.IGNORECASE)
  140.     for link in browser.links(url_regex='/covers/.*'):
  141.         return top_url + link.url
  142.     return ""
  143.  
  144. def get_browser_with(url):
  145.     browser = mechanize.Browser()
  146.     browser.set_handle_robots(False)
  147.     browser.open(url)
  148.     return browser
  149.    
  150. def download_album(url):
  151.    
  152.     if not url.startswith(top_url):
  153.         print "Wrong URL! Expected an url beginning with " + top_url
  154.         return
  155.    
  156.     browser = get_browser_with(url)
  157.    
  158.     print "-------------------------------------------------------"
  159.     print "Downloading an album from ..."
  160.     print "Url: " + url
  161.    
  162.     targetFolder = get_folder_name_for_album(browser)
  163.     print "Target folder: %s" % targetFolder
  164.     if not os.path.exists(targetFolder):
  165.         os.makedirs(targetFolder)
  166.     else:
  167.         print "Target folder already exists, cancelling download"
  168.         return
  169.    
  170.     covers_url = get_covers_url(browser)
  171.     if len(covers_url) > 0:
  172.         download_covers(covers_url, targetFolder)
  173.  
  174.     download_mp3s(browser, targetFolder)
  175.        
  176.     print "...downloading album is finished."
  177.     print "-------------------------------------------------------"
  178.  
  179. def main():
  180.     if len(sys.argv) < 2:
  181.         print "usage %s album_url" % os.path.basename(sys.argv[0])
  182.         sys.exit(1)
  183.  
  184.     download_album(sys.argv[1])
  185.  
# Run the downloader only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment