#!/usr/bin/env python
""":mod:`magic_image_crawler` --- Magic Image Crawler!
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This small program crawls a URL and downloads only the most important
images on the page.  Trivial images such as buttons and banners are
ignored.

"""
import re
import itertools
import urlparse
import urllib2
import tempfile
import os.path
import lxml.etree
import lxml.html
import Image
import ImageChops


#: The :class:`~lxml.etree.XPath` that finds image URLs.
IMG_SRC_XPATH = lxml.etree.XPath("//img[@src]/@src")

#: The default minimum width/height.
DEFAULT_MINAREA = 200 * 250

#: The default tolerance for treating background pixels.
DEFAULT_BGTOLERANCE = 10


def image_urls(url, root_url=None):
    """Finds only important image URLs from the given ``url``.

    :param url: the URL of the page to scan
    :type url: :class:`basestring`
    :param root_url: an optional URL of the home page
    :type root_url: :class:`basestring`
    :returns: a list of found image URLs
    :rtype: :class:`list`

    """
    doc = lxml.html.parse(url)
    # Determine the page charset: prefer <meta charset>, fall back to
    # <meta http-equiv="Content-Type">, and finally to UTF-8.
    charset = doc.xpath("//meta[@charset]/@charset")
    if charset:
        charset = charset[0]
    else:
        ns = lxml.etree.FunctionNamespace(None)
        ns["lower-case"] = lambda dummy, seq: seq[0].lower() if seq else None
        charset = doc.xpath("//meta[lower-case(@http-equiv)='content-type']"
                            "/@content")
        if charset:
            m = re.search(r";\s*charset\s*=\s*(\S+)\s*", charset[0], re.I)
            charset = m.group(1) if m else "utf-8"
        else:
            charset = "utf-8"
    images = (urlparse.urljoin(url, src) for src in IMG_SRC_XPATH(doc))
    # Consecutive duplicated images are probably layout elements like
    # buttons, so drop them.
    images = (k for k, v in itertools.groupby(images) if len(tuple(v)) < 2)
    # Images that also appear on the root page are likely part of the
    # layout rather than the content, so exclude them as well.
    root_url = root_url or urlparse.urljoin(url, "/")
    root_doc = lxml.html.parse(root_url)
    root_images = [urlparse.urljoin(root_url, src)
                   for src in IMG_SRC_XPATH(root_doc)]
    return [img.encode(charset) if isinstance(img, unicode) else img
            for img in images if img not in root_images]
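
# A minimal usage sketch (not part of the original script; the URLs are
# hypothetical):
#
#     urls = image_urls("http://example.com/posts/42",
#                       root_url="http://example.com/")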


def download_urls(urls):
    """Downloads the files at the given URLs.  This is a generator function
    that yields file objects.

    :param urls: a list of URLs to download
    :type urls: iterable object
    :returns: downloaded file objects
    :rtype: iterable object

    """
    for url in urls:
        u = urllib2.urlopen(url)
        f = tempfile.SpooledTemporaryFile()
        # Copy the response into a temporary file in 4 KiB chunks.
        while True:
            chunk = u.read(0x1000)
            if chunk:
                f.write(chunk)
            else:
                break
        f.seek(0)
        u.close()
        yield f
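
# A minimal usage sketch (not part of the original script; the URL is
# hypothetical):
#
#     for f in download_urls(image_urls("http://example.com/posts/42")):
#         im = Image.open(f)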


def split_image(image, minarea=DEFAULT_MINAREA,
                bgtolerance=DEFAULT_BGTOLERANCE, gap=1, recurse=3):
    """Zealously crops the given ``image`` and splits it into several
    smaller images.

    :param image: a PIL image to split
    :param minarea: a minimum width * height size for filtering.
                    default is :const:`DEFAULT_MINAREA`
    :type minarea: :class:`int`, :class:`long`
    :param bgtolerance: a tolerance for treating background pixels.
                        default is :const:`DEFAULT_BGTOLERANCE`
    :type bgtolerance: :class:`int`
    :param gap: a minimum gap in pixels. default is 1
    :type gap: :class:`int`
    :param recurse: a maximum recursion depth. default is 3
    :type recurse: :class:`int`
    :returns: split images
    :rtype: iterable object

    """
    if not Image.isImageType(image):
        raise TypeError("image must be a PIL image object, not " + repr(image))
    if gap < 1:
        raise TypeError("gap must be greater than 0")
    if image.mode != "RGB":
        image = image.convert("RGB")
    bgcolor = 255, 255, 255
    def _isbg(color):
        return sum(abs(a - b) for a, b in zip(color, bgcolor)) <= bgtolerance
    if bgtolerance < 1:
        _isbg = lambda color: color == bgcolor
    # Crop away the outer white border first.
    bg = Image.new("RGB", image.size, bgcolor)
    diff = ImageChops.difference(image, bg)
    image = image.crop(diff.getbbox())
    pixels = image.load()
    width, height = image.size
    empty = 0
    top = 0
    # Scan row by row; a run of background rows at least ``gap`` tall marks
    # a split point.
    for y in xrange(height):
        if all(_isbg(pixels[x, y]) for x in xrange(width)):
            empty += 1
        elif empty >= gap:
            if width * (y - empty - top) >= minarea:
                inner_image = image.crop((0, top, width, y - empty))
                if recurse > 0:
                    # Rotate and recurse to also split along vertical gaps.
                    inner_image = inner_image.rotate(90)
                    _imgs = split_image(inner_image,
                                        minarea=minarea,
                                        bgtolerance=bgtolerance,
                                        recurse=recurse - 1)
                    for _i in _imgs:
                        yield _i.rotate(270)
                else:
                    yield inner_image
            empty = 0
            top = y
    # Handle the last segment below the final split point.
    if empty < gap and width * (height - top) >= minarea:
        inner_image = image.crop((0, top, width, height))
        if recurse:
            inner_image = inner_image.rotate(90)
            _imgs = split_image(inner_image,
                                minarea=minarea,
                                bgtolerance=bgtolerance,
                                recurse=recurse - 1)
            for _i in _imgs:
                yield _i.rotate(270)
        else:
            yield inner_image
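
# A minimal usage sketch (not part of the original script; the file name is
# hypothetical):
#
#     im = Image.open("page.png")
#     for part in split_image(im, minarea=100 * 100):
#         part.show()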


def magic(url, root_url=None, save_path=".",
          minarea=DEFAULT_MINAREA, bgtolerance=DEFAULT_BGTOLERANCE):
    """Spells and does the magic: crawls ``url``, downloads its important
    images, splits them, and saves the pieces under ``save_path``.

    :param url: the URL to crawl
    :type url: :class:`basestring`
    :param root_url: an optional URL of the related page or home page
    :type root_url: :class:`basestring`
    :param save_path: a path to save images
    :type save_path: :class:`basestring`
    :param minarea: a minimum image area for filtering.
                    default is :const:`DEFAULT_MINAREA`
    :type minarea: :class:`int`, :class:`long`
    :param bgtolerance: a tolerance for treating background pixels.
                        default is :const:`DEFAULT_BGTOLERANCE`
    :type bgtolerance: :class:`int`
    :returns: a generator that saves images and yields their paths
    :rtype: iterable object

    """
    parsed_url = urlparse.urlparse(url)
    files = download_urls(image_urls(url, root_url))
    def _images():
        for file in files:
            try:
                im = Image.open(file)
            except IOError:
                continue
            if im.size[0] * im.size[1] >= minarea:
                yield split_image(im, minarea=minarea,
                                  bgtolerance=bgtolerance)
    # Build a file name key from the digits in the URL path, e.g. post IDs.
    m = re.match(r"^https?://[^/]+/(.*)$", url, re.IGNORECASE)
    if m:
        key = "-".join(d.group(0) for d in
                       re.finditer(r"\d+", m.group(1))) + "-"
    else:
        key = ""
    fmt = ("{0}-" + key + "{1:03d}.png").format
    n = 1
    for image_set in _images():
        for im in image_set:
            # Skip numbers that are already taken by existing files.
            while True:
                path = os.path.join(save_path, fmt(parsed_url.hostname, n))
                if os.path.isfile(path):
                    n += 1
                else:
                    break
            im.save(path)
            n += 1
            yield path
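
# A minimal usage sketch (not part of the original script; the URL and
# directory are hypothetical):
#
#     for saved in magic("http://example.com/posts/42", save_path="./images"):
#         print saved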


def _download(kwargs):
    # Worker function: runs magic() for one URL and prints each saved path.
    result = magic(**kwargs)
    for path in result:
        print path
    return kwargs["url"]


def unique_everseen(iterable, key=None):
    """List unique elements, preserving order.  Remember all elements ever
    seen.

    .. sourcecode:: pycon

       >>> list(unique_everseen('AAAABBBCCDAABBB'))
       ['A', 'B', 'C', 'D']
       >>> list(unique_everseen('ABBCcAD', str.lower))
       ['A', 'B', 'C', 'D']

    .. note:: Copied from :mod:`itertools` recipes.

    """
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in itertools.ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element


def main():
    import optparse
    import os
    import multiprocessing
    multiprocessing.freeze_support()
    parser = optparse.OptionParser(usage="%prog [options] URL...")
    parser.add_option("-r", "--root-url", metavar="URL", default=None,
                      help="an optional URL of the related page or home page.")
    parser.add_option("-d", "--save-path", metavar="DIR", default=".",
                      help="a DIR path to save images. [%default]")
    parser.add_option("-a", "--min-area",
                      type="int", metavar="AREA", default=DEFAULT_MINAREA,
                      help="a minimum image area for filtering. [%default]")
    parser.add_option("-t", "--bgtolerance", "--background-tolerance",
                      type="int", metavar="TOLERANCE",
                      default=DEFAULT_BGTOLERANCE,
                      help="a TOLERANCE for treating background pixels. "
                           "[%default]")
    parser.add_option("-w", "--workers", type="int", metavar="NUM", default=3,
                      help="the number of workers [%default]")
    options, urls = parser.parse_args()
    if not urls:
        parser.error("one or more URLs to crawl are required.")
    urls = itertools.chain.from_iterable(url.split() for url in urls)
    urls = list(unique_everseen(urls))  # remove duplicates
    if not os.path.isdir(options.save_path):
        os.makedirs(options.save_path)
    args = [("root_url", options.root_url),
            ("save_path", options.save_path),
            ("minarea", options.min_area),
            ("bgtolerance", options.bgtolerance)]
    pool_size = min(len(urls), options.workers)
    if pool_size < 2:
        # Single worker: run the downloads sequentially in this process.
        args = dict(args)
        for i, url in enumerate(urls):
            if not args.get("root_url") and len(urls) > 1:
                # Fall back to another given URL as the root page.
                args["root_url"] = urls[0 if i > 0 else 1]
            _args = dict(args)
            _args["url"] = url
            _download(_args)
            print "[complete]", url
        return
    pool = multiprocessing.Pool(pool_size)
    argslist = []
    for i, url in enumerate(urls):
        _args = dict(args)
        _args["url"] = url
        if not _args.get("root_url") and len(urls) > 1:
            # Fall back to another given URL as the root page.
            _args["root_url"] = urls[0 if i > 0 else 1]
        argslist.append(_args)
    result = pool.imap_unordered(_download, argslist)
    for url in result:
        print "[complete]", url


if __name__ == "__main__":
    main()