Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import contextlib
import os
import re
import sys
import urllib2

import mechanize
# Root of the site this script downloads from: album URLs must start with it
# and site-relative cover links are prefixed with it.
top_url = "http://musicmp3spb.org"
def download_file(url, file_name):
    """Download ``url`` into ``file_name`` while printing progress to stdout.

    Args:
        url: Address passed to ``urllib2.urlopen``.
        file_name: Path of the file to create (opened in binary write mode).

    The response and the output file are both closed even when the transfer
    fails part-way. When the server sends no usable ``Content-Length`` header
    the size is reported as unknown and only the byte count is shown.
    """
    block_sz = 8192
    with contextlib.closing(urllib2.urlopen(url)) as u:
        # Content-Length is optional; an absent or malformed value means the
        # total size (and therefore the percentage) is unknown.
        lengths = u.info().getheaders("Content-Length")
        try:
            file_size = int(lengths[0]) if lengths else 0
        except ValueError:
            file_size = 0
        print("Downloading: %s Bytes: %s" % (file_name, file_size or "unknown"))

        file_size_dl = 0
        with open(file_name, 'wb') as f:
            while True:
                chunk = u.read(block_sz)
                if not chunk:
                    break
                file_size_dl += len(chunk)
                f.write(chunk)
                if file_size > 0:
                    status = "%10d [%3.2f%%]" % (
                        file_size_dl, file_size_dl * 100. / file_size)
                else:
                    status = "%10d" % file_size_dl
                # Carriage return rewinds to the line start so the next
                # update overwrites this one; flush because no newline is sent.
                sys.stdout.write("\r" + status)
                sys.stdout.flush()

    if file_size_dl:
        # Finish the progress line so later output starts on a fresh line.
        sys.stdout.write("\n")
def download_song(url, targetFolder):
    """Download the song whose page is at ``url`` into ``targetFolder``.

    Opens the song page, submits the first form whose ``action`` starts with
    ``/file/`` and then downloads every link on the resulting page whose URL
    matches ``tempfile.ru``.

    Args:
        url: Address of the song page.
        targetFolder: Existing directory that receives the downloaded file.

    The file is named after the text of the last ``/file/`` link on the song
    page, or ``url + '.mp3'`` when there is no such link; either way the name
    is sanitised so that it cannot contain path separators.
    """
    print("Downloading song from %s" % url)
    browser = get_browser_with(url)

    # Position of the download form among the page's forms. Falls back to the
    # first form (index 0) when nothing matches, as the original code did.
    action_regexp = re.compile(r'/file/.*')
    form_index = 0
    for index, form in enumerate(browser.forms()):
        # A form may lack an action attribute altogether.
        if action_regexp.match(form.attrs.get('action') or ''):
            form_index = index
            break

    file_name = url + '.mp3'
    for link in browser.links(url_regex=r'/file/'):
        if link.text:
            file_name = link.text
    # Both the URL-based fallback and arbitrary link text can contain '/' or
    # ':' which would otherwise corrupt the target path.
    file_name = remove_characters_illegal_in_paths(file_name)

    browser.select_form(nr=form_index)
    browser.submit()

    target_path = os.path.join(targetFolder, file_name)
    for link in browser.links(url_regex=r'tempfile\.ru'):
        download_file(link.url, target_path)
def _site_root(base_url):
    """Return ``scheme://host`` for an http(s) URL, else its first '/'-separated part."""
    parts = base_url.split('/')
    # 'http://host' splits into ['http:', '', 'host'], so three parts suffice.
    if len(parts) >= 3 and parts[0] in ('http:', 'https:'):
        return parts[0] + '//' + parts[2]
    return parts[0]


def _title_text(raw_title):
    """Return a link title as text, decoding cp1251 bytes; empty text for ``None``."""
    if raw_title is None:
        return u''
    if isinstance(raw_title, bytes):
        return raw_title.decode('cp1251', 'replace')
    return raw_title


def download_mp3s(browser, targetFolder):
    """Download every song linked from the album page loaded in ``browser``.

    A link qualifies when its URL matches ``/download/.*`` and it carries a
    ``title`` attribute matching "Скачать mp3" (case-insensitively). Each
    qualifying link is resolved to an absolute URL and handed to
    ``download_song``.

    Args:
        browser: Browser object whose ``links()`` yields the album page links.
        targetFolder: Directory passed through to ``download_song``.
    """
    print("\nMP3")
    # Match on decoded text: case-insensitive matching of Cyrillic letters
    # does not work on encoded byte strings.
    title_regexp = re.compile(u'.*Скачать mp3.*', re.IGNORECASE | re.UNICODE)
    for link in browser.links(url_regex=r'/download/.*'):
        titles = [attr[1] for attr in link.attrs
                  if len(attr) == 2 and attr[0] == 'title']
        for raw_title in titles:
            if not title_regexp.match(_title_text(raw_title)):
                continue
            songUrl = link.url
            if not songUrl.startswith('http'):
                # Site-relative link: prefix it with the page's scheme + host.
                songUrl = _site_root(link.base_url) + songUrl
            print(songUrl)
            download_song(songUrl, targetFolder)
def download_covers(url, targetFolder):
    """Download all cover images linked from the covers page at ``url``.

    Every link whose URL matches ``/images/`` is treated as a cover and saved
    as ``cover_NN<ext>`` (numbered from 01 in page order). With more than four
    covers the files go into a ``covers`` subfolder of ``targetFolder``,
    which is created when missing; otherwise they go into ``targetFolder``.

    Args:
        url: Address of the covers page.
        targetFolder: Existing album directory.
    """
    print("\nCOVERS")
    print("Downloading covers from %s" % url)
    browser = get_browser_with(url)
    cover_links = list(browser.links(url_regex=r'/images/'))

    covers_target_folder = targetFolder
    if len(cover_links) > 4:
        covers_target_folder = os.path.join(targetFolder, "covers")
        if not os.path.exists(covers_target_folder):
            os.makedirs(covers_target_folder)
        print("There are %d covers, therefore they will be downloaded into subfolder %s" % (len(cover_links), covers_target_folder))

    for counter, link in enumerate(cover_links, 1):
        # Only site-relative links need the site root prepended.
        if link.url.startswith('http'):
            cover_url = link.url
        else:
            cover_url = top_url + link.url
        print(cover_url)
        # Take the extension from the last path segment only (query string
        # dropped), so a dot in the host name is never mistaken for it.
        file_name = cover_url.split("?")[0].split("/")[-1]
        file_extension = os.path.splitext(file_name)[1]
        cover_target_name = os.path.join(
            covers_target_folder, "cover_%02d%s" % (counter, file_extension))
        download_file(cover_url, cover_target_name)
def get_folder_name_for_album(browser):
    """Build the target folder name for the album page loaded in ``browser``.

    The page title is split into album and artist. When that yields exactly
    two parts the result is ``"<artist> - <year> - <album>"``, with the year
    part left out when no year could be found. Otherwise the title parts are
    joined with commas and used as-is. In both cases characters that are
    illegal in paths are replaced.

    Args:
        browser: Browser object providing ``title()`` and the page response.

    Returns:
        The sanitised folder name.
    """
    album_and_artist = get_album_and_artist(browser.title())
    if len(album_and_artist) != 2:
        print("Cannot format an album title automatically, sorry...")
        folder_name = remove_characters_illegal_in_paths(",".join(album_and_artist))
        print("Following folder name will be used: " + folder_name)
        return folder_name

    album, artist = album_and_artist
    year = get_release_year(browser, album)
    pieces = [artist.strip()]
    if year:
        # An empty year would otherwise leave a doubled " -  - " separator.
        pieces.append(year)
    pieces.append(album.strip())
    return remove_characters_illegal_in_paths(" - ".join(pieces))
def remove_characters_illegal_in_paths(inputName):
    """Return ``inputName`` with characters unsafe in file names replaced by ``_``.

    Replaced are the reserved characters ``\\ : * ? " < > | /`` and all ASCII
    control characters (code points below 32, including NUL, tab and
    newline). Every other character is kept unchanged.

    Args:
        inputName: Proposed file or folder name.

    Returns:
        The sanitised name; same length as the input.
    """
    reserved = set('\\:*?"<>|/')
    return "".join(
        "_" if (character in reserved or ord(character) < 32) else character
        for character in inputName
    )
def get_album_and_artist(webPageTitle):
    """Split a page title into its comma-separated parts.

    A trailing ``" mp3"`` is cut off first. The parts are returned exactly as
    split (surrounding whitespace is not stripped); the list has one element
    when the title contains no comma.
    """
    suffix = " mp3"
    title = webPageTitle[:-len(suffix)] if webPageTitle.endswith(suffix) else webPageTitle
    return title.split(",")
def get_release_year(browser, albumTitle):
    """Extract the four-digit release year of ``albumTitle`` from the page HTML.

    The year is expected directly after the markup
    ``<div class="Name">\\n<albumTitle><br />\\n<i>`` in the page source.

    Args:
        browser: Browser object whose ``response().read()`` returns the HTML.
        albumTitle: Album title exactly as it appears in the page markup.

    Returns:
        The year as a four-character string, or ``""`` when the markup is not
        found or is not followed by four digits.
    """
    htmlOfWebPage = browser.response().read()
    htmlSnippetBeforeYear = "<div class=\"Name\">\n" + albumTitle + "<br />\n<i>"
    positionOfSnippet = htmlOfWebPage.find(htmlSnippetBeforeYear)
    if positionOfSnippet < 0:
        return ""
    yearStartPosition = positionOfSnippet + len(htmlSnippetBeforeYear)
    year = htmlOfWebPage[yearStartPosition:yearStartPosition + 4]
    # Guard against truncated pages and non-year text after the marker, which
    # would otherwise end up verbatim in the folder name.
    if len(year) != 4 or not year.isdigit():
        return ""
    return "%s" % year
def get_covers_url(browser):
    """Return the absolute URL of the album's covers page, or ``""`` if none.

    Only the first link whose URL matches ``/covers/.*`` is considered. A
    site-relative link is prefixed with ``top_url``; a link that is already
    absolute is returned unchanged.

    Args:
        browser: Browser object whose ``links()`` yields the album page links.
    """
    for link in browser.links(url_regex=r'/covers/.*'):
        if link.url.startswith('http'):
            return link.url
        return top_url + link.url
    return ""
def get_browser_with(url):
    """Return a new ``mechanize.Browser`` that has ``url`` opened.

    robots.txt handling is switched off before the page is requested.
    """
    session = mechanize.Browser()
    session.set_handle_robots(False)
    session.open(url)
    return session
def download_album(url):
    """Download the covers and songs of the album page at ``url``.

    Nothing is downloaded when ``url`` does not start with ``top_url`` or
    when the target folder already exists; in both cases a message is printed
    and the function returns. Otherwise the folder is created, covers are
    fetched when the page links to a covers page, and then the songs.
    """
    if not url.startswith(top_url):
        print("Wrong URL! Expected an url beginning with " + top_url)
        return

    separator = "-------------------------------------------------------"
    browser = get_browser_with(url)
    print(separator)
    print("Downloading an album from ...")
    print("Url: " + url)

    targetFolder = get_folder_name_for_album(browser)
    print("Target folder: %s" % targetFolder)
    if os.path.exists(targetFolder):
        # An existing folder is taken to mean the album was already fetched.
        print("Target folder already exists, cancelling download")
        return
    os.makedirs(targetFolder)

    covers_url = get_covers_url(browser)
    if len(covers_url):
        download_covers(covers_url, targetFolder)
    download_mp3s(browser, targetFolder)

    print("...downloading album is finished.")
    print(separator)
def main():
    """Command-line entry point: download the album given as first argument.

    Prints a usage line and exits with status 1 when no argument is supplied.
    """
    arguments = sys.argv
    if len(arguments) >= 2:
        download_album(arguments[1])
        return
    print("usage %s album_url" % os.path.basename(arguments[0]))
    sys.exit(1)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment