Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import System.Text
- import System.Text.RegularExpressions
- import System.Web.Script.Serialization
- import AlbumArtDownloader.Scripts
- import util
// Deserialization target for the "var state = {...}" JSON blob embedded in
// Amazon's customer-media product-gallery pages (see Amazon.Search).
// Field names must match the JSON keys exactly — JavaScriptSerializer maps by name.
class ImageInfo:
	public pageUrl as String        // gallery page this state blob came from
	public imageList as List[Image] // customer images listed on the page

	// One customer-uploaded image entry within the gallery state.
	class Image:
		public url as String  // full-size image URL
		public id as String   // gallery image id (used for ?currentImageID= links)
		public width as int   // reported pixel width
		public height as int  // reported pixel height
- //Inheritors should override the Suffix property to return a valid amazon suffix (like com, co.uk, de, etc...).
// Inheritors should override the Suffix property to return a valid amazon suffix
// (like com, co.uk, de, etc...).
abstract class Amazon(AlbumArtDownloader.Scripts.IScript):
	// Keys are product-shot image bases that accept Amazon size suffixes
	// (_THUMB_/_SCRM_/_SCL_); RetrieveFullSizeImage uses this to tell them
	// apart from plain full-size customer-image URLs.
	private highresResults as Hash

	virtual Name as string:
		get: return "Amazon (.${Suffix})"

	Version as string:
		get: return "0.8s"

	Author as string:
		get: return "Alex Vallat"

	// Amazon domain suffix ("com", "co.uk", "de", ...); each concrete
	// subclass must supply one.
	abstract protected Suffix as string:
		get: pass

	// Country code segment of the images-amazon.com product image path.
	virtual protected CountryCode as string:
		get: return "01"

	virtual protected SearchIndex as string: //Deprecated, ignored.
		get: return ""

	// Builds the search-results URL for the given (already-sanitized) artist/album.
	virtual protected def GetUrl(artist as string, album as string) as string:
		return "http://www.amazon.${Suffix}/gp/search?search-alias=popular&field-artist=${EncodeUrl(artist, PageEncoding)}&field-keywords=${EncodeUrl(album, PageEncoding)}&sort=relevancerank"

	// Encoding used when fetching and URL-encoding for Amazon pages.
	virtual protected PageEncoding as Encoding:
		get: return Encoding.GetEncoding("iso-8859-1")

	// Searches Amazon for the album and reports results:
	// pass 1 adds the standard product shot for every match,
	// pass 2 (first 5 matches only) scrapes customer image galleries.
	def Search(artist as string, album as string, results as IScriptResults):
		highresResults = {}

		// Strip punctuation that disrupts Amazon's search matching.
		artist = StripCharacters("&.'\";:?!", artist)
		album = StripCharacters("&.'\";:?!", album)

		url = GetUrl(artist, album)
		resultsPage = GetPage(GetPageStream(url, null, true), PageEncoding)

		// Captures each result's product link (url, /dp/ id), title, and an
		// optional "by/von/de/di <artist>" byline.
		resultsRegex = Regex("<div\\s[^>]*class\\s*=\\s*\"title\"[^>]*>\\s*<a\\s[^>]*href\\s*=\\s*\"(?<url>[^\"]+?/dp/(?<id>[^/]+)/)[^>]+>\\s*(?<title>.*?)</a>(?:\\s*<span\\s[^>]*class=\"ptBrand\"[^>]*>(?:[^<]*<a\\s[^>]*>)?\\s*(?:by |von |de |di )?(?<artist>[^<]+))?", RegexOptions.Singleline | RegexOptions.IgnoreCase)
		resultsMatches = resultsRegex.Matches(resultsPage)
		results.EstimatedCount = resultsMatches.Count

		json = JavaScriptSerializer()

		// Pass 1: report the standard product-shot thumbnail for every result.
		for resultsMatch as Match in resultsMatches:
			id = resultsMatch.Groups["id"].Value
			url = resultsMatch.Groups["url"].Value
			title = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["title"].Value)
			artist = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["artist"].Value)

			imageBase = "http://ecx.images-amazon.com/images/P/${id}.${CountryCode}."
			thumbnail = TryGetImageStream(imageBase + "_THUMB_")
			// Remember this base so RetrieveFullSizeImage knows to try the
			// _SCRM_/_SCL_ size suffixes later.
			highresResults[imageBase] = true
			results.Add(thumbnail, "${artist} - ${title}", url, -1, -1, imageBase, CoverType.Front)

		// Pass 2: customer image galleries — one extra page hit per result.
		count = 0
		for resultsMatch as Match in resultsMatches:
			// We hit a page for each result. Searches on Amazon should generally return the
			// item that was searched for quickly if it's going to be found at all, so don't
			// hammer the server.
			count += 1
			if count > 5:
				break

			id = resultsMatch.Groups["id"].Value
			title = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["title"].Value)
			artist = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["artist"].Value)

			// FIX: was "www.amazon.co.${Suffix}", which produces broken hosts
			// like "amazon.co.com" / "amazon.co.co.uk"; GetUrl shows the
			// correct form is "www.amazon.${Suffix}".
			images_url = "http://www.amazon.${Suffix}/gp/customer-media/product-gallery/${id}"
			imagesPage = GetPage(GetPageStream(images_url, null, true), PageEncoding)

			// The gallery embeds its data as a JS object literal: var state = {...};
			jsonRegex = Regex('var state = (?<json>{[^;]*});', RegexOptions.Multiline)
			for jsonDataMatch as Match in jsonRegex.Matches(imagesPage):
				jsonData = jsonDataMatch.Groups["json"].Value
				// amazon.co.jp uses double-width backslashes when escaping JS strings. No, really.
				// NOTE(review): the pasted source had an unterminated pattern here;
				// replacing the fullwidth backslash (U+FF3C) with an ASCII one
				// matches the comment's intent — confirm against a live
				// amazon.co.jp gallery page.
				jsonData = Regex("＼").Replace(jsonData, "\\")
				result = json.Deserialize[of ImageInfo](jsonData)
				for image as ImageInfo.Image in result.imageList:
					thumbnail_url = image.url
					// Ask Amazon's image server for a 120px-wide variant as the thumbnail.
					thumbnail_url = Regex("\\.jpg$").Replace(thumbnail_url, "._SX120_.jpg")
					results.Add(thumbnail_url, "${artist} - ${title}",
						images_url + "?currentImageID=${image.id}", image.width, image.height,
						image.url, CoverType.Front)

			// This one contains secondary official images. (Why are these separate pages?) This
			// is disabled; this will double the number of hits to the server, and most of the better
			// images are user images anyway.
			// images_url = "http://www.amazon.${Suffix}/gp/product/images/${id}"
			// imagesPage = GetPage(GetPageStream(images_url, null, true), PageEncoding)
			// customerImageRegex = Regex('fetchImage\\("alt_image_(?<idx>[0-9]+)", "(?<url>http[^"]+)"')
			// imageMatches = customerImageRegex.Matches(imagesPage)
			// for productMatch as Match in imageMatches:
			//     image_url = productMatch.Groups["url"].Value
			//     idx = productMatch.Groups["idx"].Value
			//     results.Add(image_url, "${artist} - ${title}", url + "?img=" + idx, -1, -1, image_url, CoverType.Front)

	// Resolves the full-size image for a result added by Search.
	// Product-shot bases get the _SCRM_ (hi-res) suffix with an _SCL_ (large)
	// fallback; anything else is already a direct image URL.
	def RetrieveFullSizeImage(imageBase):
		if not highresResults.ContainsKey(imageBase):
			return TryGetImageStream(imageBase)

		imageStream = TryGetImageStream(imageBase + "_SCRM_")
		if imageStream != null:
			return imageStream
		//Fall back on Large size
		return TryGetImageStream(imageBase + "_SCL_")

	// Fetches url and returns the response stream, or null on failure.
	// The >43-byte check filters out Amazon's tiny placeholder response for
	// missing images (presumably a 43-byte 1x1 GIF — TODO confirm).
	def TryGetImageStream(url):
		request as System.Net.HttpWebRequest = System.Net.HttpWebRequest.Create(url)
		try:
			response = request.GetResponse()
			if response.ContentLength > 43:
				return response.GetResponseStream() // caller owns the stream
			response.Close()
			return null
		except e as System.Net.WebException:
			return null
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement