Advertisement
Guest User

Untitled

a guest
Sep 18th, 2011
180
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.46 KB | None | 0 0
  1. import System.Text
  2. import System.Text.RegularExpressions
  3. import System.Web.Script.Serialization
  4. import AlbumArtDownloader.Scripts
  5. import util
  6.  
  // Deserialization target for the JSON "state" object that Amazon.Search extracts
  // from a product-gallery page (see the `var state = {...};` regex there).
  // NOTE(review): field names presumably have to match the JSON keys for
  // JavaScriptSerializer to populate them -- do not rename without checking.
  class ImageInfo:
      // A single entry of imageList.
      class Image:
          // Identifier appended as "?currentImageID=..." to the gallery page URL.
          public id as String
          // Address of the full-size image; Search derives the thumbnail URL from it.
          public url as String
          // Dimensions reported for the image (presumably pixels -- confirm).
          public width as int
          public height as int

      // All images listed in the gallery state object.
      public imageList as List[Image]
      // Not read anywhere in this script; kept so the deserialized shape is unchanged.
      public pageUrl as String
  15.  
  // Base class for the Amazon album-art search scripts.
  //
  // Search() scrapes an Amazon search results page, reports one result per hit (using
  // the product image derived from the item id), and then fetches the customer image
  // gallery of the first few hits to report those images as additional results.
  //
  // Inheritors should override the Suffix property to return a valid amazon suffix
  // (like com, co.uk, de, etc...).
  abstract class Amazon(AlbumArtDownloader.Scripts.IScript):
      // Only this many search hits get their gallery page fetched. Each lookup is an
      // extra page request, and a relevant hit is expected near the top of the results,
      // so the server is not hammered for the long tail.
      private static final maxGalleryLookups = 5

      // Matches one search hit; captures the product url, the item id, the title and
      // (optionally) the artist.
      private static final resultsRegex = Regex("<div\\s[^>]*class\\s*=\\s*\"title\"[^>]*>\\s*<a\\s[^>]*href\\s*=\\s*\"(?<url>[^\"]+?/dp/(?<id>[^/]+)/)[^>]+>\\s*(?<title>.*?)</a>(?:\\s*<span\\s[^>]*class=\"ptBrand\"[^>]*>(?:[^<]*<a\\s[^>]*>)?\\s*(?:by |von |de |di )?(?<artist>[^<]+))?", RegexOptions.Singleline | RegexOptions.IgnoreCase)

      // Matches the JSON object assigned to "var state" on a gallery page.
      private static final galleryStateRegex = Regex('var state = (?<json>{[^;]*});', RegexOptions.Multiline)

      // Matches a trailing ".jpg" so a size modifier can be spliced in before it.
      private static final jpgSuffixRegex = Regex("\\.jpg$")

      // Image bases produced by the most recent Search(). RetrieveFullSizeImage uses this
      // to tell them apart from plain image URLs. Initialised here so that calling
      // RetrieveFullSizeImage before any Search() does not dereference null.
      private highresResults as Hash = {}

      virtual Name as string:
          get: return "Amazon (.${Suffix})"
      Version as string:
          get: return "0.8s"
      Author as string:
          get: return "Alex Vallat"
      abstract protected Suffix as string:
          get: pass
      virtual protected CountryCode as string:
          get: return "01"
      virtual protected SearchIndex as string: //Deprecated, ignored.
          get: return ""
      virtual protected def GetUrl(artist as string, album as string) as string:
          return "http://www.amazon.${Suffix}/gp/search?search-alias=popular&field-artist=${EncodeUrl(artist, PageEncoding)}&field-keywords=${EncodeUrl(album, PageEncoding)}&sort=relevancerank"
      virtual protected PageEncoding as Encoding:
          get: return Encoding.GetEncoding("iso-8859-1")

      // Address of the customer image gallery page for the item with the given id.
      // Builds the host from Suffix in the same way as GetUrl (the previous code
      // hard-coded "amazon.co.", which yields e.g. "amazon.co.com" for suffix "com").
      // NOTE(review): an inheritor that overrides GetUrl because its host cannot be
      // derived from Suffix probably needs to override this as well -- confirm.
      virtual protected def GetImagesUrl(id as string) as string:
          return "http://www.amazon.${Suffix}/gp/customer-media/product-gallery/${id}"


      // Searches Amazon for the given artist and album and adds every hit to results.
      //   artist, album: search terms; some punctuation is stripped before querying.
      //   results: receives one entry per search hit, followed by the gallery images
      //            of the first maxGalleryLookups hits.
      def Search(artist as string, album as string, results as IScriptResults):
          highresResults = {}

          artist = StripCharacters("&.'\";:?!", artist)
          album = StripCharacters("&.'\";:?!", album)

          searchUrl = GetUrl(artist, album)
          resultsPage = GetPage(GetPageStream(searchUrl, null, true), PageEncoding)
          resultsMatches = resultsRegex.Matches(resultsPage)

          results.EstimatedCount = resultsMatches.Count

          for resultsMatch as Match in resultsMatches:
              id = resultsMatch.Groups["id"].Value
              url = resultsMatch.Groups["url"].Value
              imageBase = "http://ecx.images-amazon.com/images/P/${id}.${CountryCode}."

              thumbnail = TryGetImageStream(imageBase + "_THUMB_")
              // Remember the base so RetrieveFullSizeImage knows to append a size code.
              highresResults[imageBase] = true

              results.Add(thumbnail, GetCaption(resultsMatch), url, -1, -1, imageBase, CoverType.Front)

          // We hit a page for each result. Searches on Amazon should generally return the
          // item that was searched for quickly if it's going to be found at all, so don't
          // hammer the server.
          json = JavaScriptSerializer()
          lookups = System.Math.Min(resultsMatches.Count, maxGalleryLookups)
          for index in range(lookups):
              AddGalleryImages(resultsMatches[index], json, results)

      // Builds the "artist - title" caption shown for a search hit.
      private def GetCaption(resultsMatch as Match) as string:
          title = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["title"].Value)
          artist = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["artist"].Value)
          return "${artist} - ${title}"

      // Fetches the gallery page of one search hit and adds each listed image to results.
      private def AddGalleryImages(resultsMatch as Match, json as JavaScriptSerializer, results as IScriptResults):
          id = resultsMatch.Groups["id"].Value
          caption = GetCaption(resultsMatch)

          images_url = GetImagesUrl(id)
          imagesPage = GetPage(GetPageStream(images_url, null, true), PageEncoding)
          for jsonDataMatch as Match in galleryStateRegex.Matches(imagesPage):
              jsonData = jsonDataMatch.Groups["json"].Value

              // amazon.co.jp uses double-width backslashes when escaping JS strings. No, really.
              // U+FF3C (FULLWIDTH REVERSE SOLIDUS) is written as an escape so the character
              // cannot be lost to an encoding mix-up again.
              jsonData = jsonData.Replace("\uFF3C", "\\")

              result = json.Deserialize[of ImageInfo](jsonData)
              // A state object without an image list has nothing to report.
              if result is null or result.imageList is null:
                  continue

              for image as ImageInfo.Image in result.imageList:
                  // "._SX120_" is spliced in before the extension to request a small
                  // rendition for the thumbnail (presumably 120px wide -- confirm).
                  thumbnail_url = jpgSuffixRegex.Replace(image.url, "._SX120_.jpg")

                  results.Add(thumbnail_url, caption,
                      images_url + "?currentImageID=${image.id}", image.width, image.height,
                      image.url, CoverType.Front)

          // This one contains secondary official images. (Why are these separate pages?) This
          // is disabled; this will double the number of hits to the server, and most of the better
          // images are user images anyway.
          // images_url = "http://www.amazon.${Suffix}/gp/product/images/${id}"
          // imagesPage = GetPage(GetPageStream(images_url, null, true), PageEncoding)
          // customerImageRegex = Regex('fetchImage\\("alt_image_(?<idx>[0-9]+)", "(?<url>http[^"]+)"')
          // imageMatches = customerImageRegex.Matches(imagesPage)
          // for productMatch as Match in imageMatches:
          //     image_url = productMatch.Groups["url"].Value
          //     idx = productMatch.Groups["idx"].Value
          //     results.Add(image_url, caption, url + "?img=" + idx, -1, -1, image_url, CoverType.Front)

      // Returns a stream for the full-size version of a result, or null if none is available.
      //   imageBase: the value passed as the full-size callback parameter to results.Add in
      //              Search(): either an image base recorded in highresResults, or a
      //              complete image URL (gallery images).
      def RetrieveFullSizeImage(imageBase):
          // Anything Search() did not record as an image base is fetched as-is.
          if not highresResults.ContainsKey(imageBase):
              return TryGetImageStream(imageBase)

          imageStream = TryGetImageStream(imageBase + "_SCRM_")
          if imageStream != null:
              return imageStream

          //Fall back on Large size
          return TryGetImageStream(imageBase + "_SCL_")

      // Requests url and returns the response stream, or null when the request fails with
      // a WebException or the response is too small to be a real image.
      def TryGetImageStream(url):
          request as System.Net.HttpWebRequest = System.Net.HttpWebRequest.Create(url)
          try:
              response = request.GetResponse()
              // NOTE(review): 43 bytes is presumably the size of the placeholder image
              // Amazon serves when no cover exists -- confirm.
              if response.ContentLength > 43:
                  return response.GetResponseStream()

              // Too small to be useful: release the connection and report "no image".
              response.Close()
              return null
          except e as System.Net.WebException:
              return null
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement