Advertisement
Guest User

Untitled

a guest
Sep 17th, 2011
192
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.09 KB | None | 0 0
  1. import System.Text
  2. import System.Text.RegularExpressions
  3. import System.Web.Script.Serialization
  4. import AlbumArtDownloader.Scripts
  5. import util
  6.  
  // Deserialization target for the "var state = {...}" JSON blob that Search
  // extracts from a product-gallery page. Field names are matched against the
  // JSON keys by JavaScriptSerializer, so they must not be renamed.
  class ImageInfo:
      // One entry of the gallery's image list.
      class Image:
          // Address of the image file.
          public url as string
          // Identifier appended to the gallery URL as "currentImageID".
          public id as string
          // Dimensions as reported by the page (presumably pixels).
          public width as int
          public height as int

      // Left null when the blob carries no "pageUrl" key.
      public pageUrl as string
      // Images listed by the blob; null when the key is absent.
      public imageList as List[of Image]
  15.  
  // Base class for scripts that search an Amazon storefront for cover art.
  // Inheritors should override the Suffix property to return a valid amazon suffix (like com, co.uk, de, etc...).
  abstract class Amazon(AlbumArtDownloader.Scripts.IScript):
      // Only this many search hits are followed up, because every hit costs
      // additional page requests.
      private static final MaxDetailedResults as int = 5

      // Responses of this size or smaller are treated as "no image".
      // NOTE(review): 43 bytes is presumably the size of the 1x1 placeholder GIF
      // served for missing images - confirm.
      private static final PlaceholderImageSize as int = 43

      // Keys are the imageBase strings registered by Search. RetrieveFullSizeImage
      // uses membership to tell an image base (which needs a size suffix) from a
      // complete image URL. Initialized here so RetrieveFullSizeImage is safe to
      // call before the first Search.
      private highresResults as Hash = {}

      virtual Name as string:
          get: return "Amazon (.${Suffix})"
      Version as string:
          get: return "0.8s"
      Author as string:
          get: return "Alex Vallat"
      // Storefront domain suffix, used for both the search and the gallery URLs.
      abstract protected Suffix as string:
          get: pass
      // Inserted into the image base URL after the product id.
      virtual protected CountryCode as string:
          get: return "01"
      virtual protected SearchIndex as string: //Deprecated, ignored.
          get: return ""
      // Builds the search results URL for the given artist and album.
      virtual protected def GetUrl(artist as string, album as string) as string:
          return "http://www.amazon.${Suffix}/gp/search/ref=sr_adv_m_pop/?search-alias=popular&field-artist=${EncodeUrlIsoLatin1(artist)}&field-title=${EncodeUrlIsoLatin1(album)}&sort=relevancerank"
      // Encoding used to decode every page fetched by Search.
      virtual protected PageEncoding as Encoding:
          get: return Encoding.GetEncoding("iso-8859-1")


      // Searches the storefront and reports cover images through results.
      //   artist, album - search terms; the characters &.'";:?! are stripped first.
      //   results       - receives one entry per followed search hit (thumbnail
      //                   stream plus an image base for full-size retrieval) and
      //                   one entry per image listed on that hit's gallery page.
      def Search(artist as string, album as string, results as IScriptResults):
          highresResults = {}

          artist = StripCharacters("&.'\";:?!", artist)
          album = StripCharacters("&.'\";:?!", album)

          searchUrl = GetUrl(artist, album)
          resultsPage = GetPage(GetPageStream(searchUrl, null, true), PageEncoding)

          resultsRegex = Regex("<div\\s[^>]*class\\s*=\\s*\"title\"[^>]*>\\s*<a\\s[^>]*href\\s*=\\s*\"(?<url>[^\"]+?/dp/(?<id>[^/]+)/)[^>]+>\\s*(?<title>.*?)</a>(?:\\s*<span\\s[^>]*class=\"ptBrand\"[^>]*>(?:[^<]*<a\\s[^>]*>)?\\s*(?:by |von |de |di )?(?<artist>[^<]+))?", RegexOptions.Singleline | RegexOptions.IgnoreCase)
          resultsMatches = resultsRegex.Matches(resultsPage)

          results.EstimatedCount = resultsMatches.Count

          // Both helpers are loop-invariant, so build them once.
          json = JavaScriptSerializer()
          jsonRegex = Regex('var state = (?<json>{[^;]*});', RegexOptions.Multiline)

          count = 0
          for resultsMatch as Match in resultsMatches:
              // We hit a page for each result. Searches on Amazon should generally return the
              // item that was searched for quickly if it's going to be found at all, so don't
              // hammer the server.
              if count >= MaxDetailedResults:
                  break
              count += 1

              id = resultsMatch.Groups["id"].Value
              productUrl = resultsMatch.Groups["url"].Value
              title = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["title"].Value)
              resultArtist = System.Web.HttpUtility.HtmlDecode(resultsMatch.Groups["artist"].Value)
              caption = "${resultArtist} - ${title}"
              imageBase = "http://ecx.images-amazon.com/images/P/${id}.${CountryCode}."

              thumbnail = TryGetImageStream(imageBase + "_THUMB_")
              // Mark this key as an image base so RetrieveFullSizeImage appends a size suffix.
              highresResults[imageBase] = true
              results.Add(thumbnail, caption, productUrl, -1, -1, imageBase, CoverType.Front)

              // Use the same storefront as the search; a product id found on one
              // storefront should not be looked up on another.
              images_url = "http://www.amazon.${Suffix}/gp/customer-media/product-gallery/${id}"
              imagesPage = GetPage(GetPageStream(images_url, null, true), PageEncoding)
              for jsonDataMatch as Match in jsonRegex.Matches(imagesPage):
                  jsonData = jsonDataMatch.Groups["json"].Value

                  // amazon.co.jp uses double-width backslashes when escaping JS strings. No, really.
                  // U+FF3C (FULLWIDTH REVERSE SOLIDUS) is written as an escape so the
                  // character cannot be mangled when this file is re-encoded.
                  jsonData = jsonData.Replace("\uFF3C", "\\")

                  gallery = json.Deserialize[of ImageInfo](jsonData)
                  // Skip state blobs that do not describe a gallery.
                  if gallery is null or gallery.pageUrl is null or gallery.imageList is null:
                      continue
                  for image as ImageInfo.Image in gallery.imageList:
                      // The image URL serves as both thumbnail and full-size source.
                      results.Add(image.url, caption,
                          images_url + "?currentImageID=${image.id}", image.width, image.height,
                          image.url, CoverType.Front)

              // This one contains secondary official images. (Why are these separate pages?) This
              // is disabled; this will double the number of hits to the server, and most of the better
              // images are user images anyway.
              // images_url = "http://www.amazon.${Suffix}/gp/product/images/${id}"
              // imagesPage = GetPage(GetPageStream(images_url, null, true), PageEncoding)
              // customerImageRegex = Regex('fetchImage\\("alt_image_(?<idx>[0-9]+)", "(?<url>http[^"]+)"')
              // imageMatches = customerImageRegex.Matches(imagesPage)
              // for productMatch as Match in imageMatches:
              //     image_url = productMatch.Groups["url"].Value
              //     idx = productMatch.Groups["idx"].Value
              //     results.Add(image_url, caption, productUrl + "?img=" + idx, -1, -1, image_url, CoverType.Front)

      // Returns a stream for the full-size image identified by imageBase, or null.
      // imageBase is either a key registered by Search (tried with the "_SCRM_"
      // suffix, then "_SCL_") or a complete image URL, which is fetched as-is.
      def RetrieveFullSizeImage(imageBase):
          if not highresResults.ContainsKey(imageBase):
              return TryGetImageStream(imageBase)

          imageStream = TryGetImageStream(imageBase + "_SCRM_")
          if imageStream != null:
              return imageStream

          //Fall back on Large size
          return TryGetImageStream(imageBase + "_SCL_")

      // Requests url and returns the response stream, or null when the request
      // raises a WebException or the reported content length is no larger than
      // PlaceholderImageSize.
      def TryGetImageStream(url):
          request as System.Net.HttpWebRequest = System.Net.HttpWebRequest.Create(url)
          try:
              response = request.GetResponse()
              if response.ContentLength > PlaceholderImageSize:
                  return response.GetResponseStream()

              // Too small to be a real image: release the connection.
              response.Close()
              return null
          except e as System.Net.WebException:
              return null
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement