Advertisement
Guest User

2ch risovach thread picture saver

a guest
Nov 4th, 2014
239
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Groovy 1.91 KB | None | 0 0
  1. @Grab('net.sourceforge.nekohtml:nekohtml:1.9.16')
  2.  
  // Directory that downloaded pictures are written to.
  def SAVE_FOLDER = new File('I:/save/')
  // Text index kept inside SAVE_FOLDER; the script stores one
  // "filename:md5" line per saved picture in it.
  def HASH_LIST_FILE = new File(SAVE_FOLDER, 'list.txt')
  5.  
  /**
   * Loads an HTML document and returns it as a Groovy node tree.
   *
   * The document is read through the NekoHTML SAX parser wrapped in an
   * XmlParser. The method is synchronized and sleeps one second before
   * every call (presumably to rate-limit requests to the site).
   *
   * @param url location of the document, passed straight to XmlParser.parse
   * @return the root node produced by XmlParser
   */
  def synchronized parse(url) {
      Thread.sleep(1000L) // TODO sync+sleep is ugly hack
      def htmlReader = new XmlParser(new org.cyberneko.html.parsers.SAXParser())
      htmlReader.parse(url)
  }
  11.  
  /**
   * Downloads one picture and stores it in {@code folder}, unless a picture
   * with the same MD5 digest is already recorded in {@code listFile}.
   *
   * Every picture that gets saved adds one "filename:md5" line to
   * {@code listFile}. The method is synchronized and sleeps one second
   * before each download.
   *
   * @param url      location of the picture to download
   * @param folder   directory that receives the picture; created if missing
   * @param listFile index of "filename:md5" lines, consulted for duplicates
   *                 and appended to after a successful save
   */
  def synchronized saveImage(URL url, File folder, File listFile) {
      Thread.sleep(1000) //TODO sync+sleep is ugly hack
      def bytes = url.bytes
      def md5 = generateMD5(bytes)
      // A missing index just means nothing has been saved yet; reading its
      // text unconditionally would throw on the very first run.
      if (listFile.exists() && listFile.text.contains(":$md5")) return
      def filename = getFileName(url)
      folder.mkdirs()
      // Overwrite instead of append: a leftover file with the same name
      // (e.g. from an aborted run) must not get the bytes written twice.
      new File(folder, filename).bytes = bytes
      // Write to the index that was passed in. Script-level `def` variables
      // such as HASH_LIST_FILE are locals of the script body and are not
      // visible from inside a script method.
      listFile.append("$filename:$md5\n")
  }
  21.  
  /**
   * Returns the last segment of a URL's file part, i.e. everything after
   * the final '/'.
   *
   * Example:
   *   getFileName(new URL('https://2ch.hk/b/src/78937663/14151422038800.jpg'))
   *   == '14151422038800.jpg'
   *
   * @param url object exposing a {@code file} property (a java.net.URL here)
   * @return the text left after removing everything up to the last '/'
   */
  def getFileName(url) {
      def resourcePath = url.file
      // Greedy match swallows everything up to and including the last slash.
      def upToLastSlash = (resourcePath =~ '.*/')
      upToLastSlash.replaceAll('')
  }
  26.  
  /**
   * Computes the MD5 digest of a byte array.
   *
   * @param bytes the data to hash
   * @return the digest as 32 lowercase hexadecimal characters, including
   *         any leading zeros
   */
  def generateMD5(bytes) {
      def hash = java.security.MessageDigest.getInstance('MD5').digest(bytes)
      // encodeHex emits two lowercase hex digits per byte, so the result is
      // always 32 characters long without explicit padding.
      hash.encodeHex().toString()
  }
  32.  
  /**
   * Finds the opening posts on a parsed board page that belong to a
   * "risovach" thread.
   *
   * A DIV qualifies when its class attribute matches "oppost-wrapper"
   * (case-insensitively) and the lower-cased text of its first BLOCKQUOTE
   * contains both the flickr gallery address and the word 'рисовач'.
   *
   * @param page root node of a parsed board page
   * @return list with the id attribute of every matching DIV
   */
  def findRisovach(page) {
      def openingPosts = page.depthFirst().DIV.grep { it.'@class' ==~ /(?i)oppost-wrapper/ }
      return openingPosts.grep { oppost ->
          // [0] is null when the post contains no BLOCKQUOTE at all; safe
          // navigation turns that into "no text" instead of an exception.
          def text = oppost.depthFirst().BLOCKQUOTE[0]?.text()
          if (!text) return false
          // Compare in lower case so the match ignores capitalisation.
          text = text.toLowerCase()
          return text.contains('https://secure.flickr.com/photos/104954057@n06/') && text.contains('рисовач')
      }*.@id
  }
  41.  
  // Board index pages to scan: the front page plus 1.html .. 5.html.
  def pages = ['', *((1..5)*.plus('.html'))].collect { 'https://2ch.hk/b/' + it }

  // Thread numbers taken from the matching opening-post ids ("post-N" -> "N").
  // unique() keeps a thread that shows up on more than one index page from
  // being fetched and scanned repeatedly.
  def threads = pages.collect { findRisovach(parse(it)) }.flatten()*.replaceFirst(/^post-/, '').unique()

  // Picture links of every thread: hrefs of the anchors whose class is
  // "desktop", resolved against the site root. Duplicate hrefs are dropped
  // before resolving so no picture is downloaded twice in one run.
  def images = threads.collect { thread ->
      return parse("https://2ch.hk/b/res/${thread}.html").depthFirst().A.grep { it.@class ==~ /desktop/ }*.@href
  }.flatten().unique().collect { new URL('https://2ch.hk/'.toURL(), it) }

  images.each {
      saveImage(it, SAVE_FOLDER, HASH_LIST_FILE)
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement