Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Script configuration.
// @Grab pulls the NekoHTML parser (used by parse()) onto the classpath.
@Grab('net.sourceforge.nekohtml:nekohtml:1.9.16')
def SAVE_FOLDER = new File('I:/save/')
// Create the target directory up front; without it the first write of an
// image or of the hash list fails because the parent folder is missing.
SAVE_FOLDER.mkdirs()
// Text file inside SAVE_FOLDER that records one "filename:md5" line per saved image.
def HASH_LIST_FILE = new File(SAVE_FOLDER, 'list.txt')
/**
 * Fetches the document at {@code url} and parses it into a Groovy node tree.
 *
 * The HTML is read through NekoHTML's SAX parser, which is handed to
 * XmlParser as the underlying reader.
 *
 * @param url location passed straight to XmlParser.parse
 * @return the root node produced by XmlParser
 */
def synchronized parse (url) {
    // Fixed one-second pause before every request.
    Thread.sleep(1000) //TODO sync+sleep is ugly hack
    def htmlReader = new org.cyberneko.html.parsers.SAXParser()
    def xmlParser = new XmlParser(htmlReader)
    return xmlParser.parse(url)
}
/**
 * Downloads one image and stores it in {@code folder}, unless an image with
 * the same MD5 hash is already recorded in {@code listFile}.
 *
 * Every saved image adds a "filename:md5" line to {@code listFile}.
 *
 * @param url      address of the image to download
 * @param folder   directory the image file is written into
 * @param listFile text file of "filename:md5" records used for de-duplication
 */
def synchronized saveImage(URL url, File folder, File listFile) {
    // Fixed one-second pause before every download.
    Thread.sleep(1000) //TODO sync+sleep is ugly hack
    def bytes = url.bytes
    def md5 = generateMD5(bytes)
    // On the very first run the list file does not exist yet; reading it would
    // throw, and nothing can be a duplicate anyway.
    if (listFile.exists() && listFile.text.contains(":$md5")) return
    def filename = getFileName(url)
    // Write (not append): appending to an already existing file of the same
    // name would glue two images together and corrupt the result.
    new File(folder, filename).bytes = bytes
    // Record the hash in the file we were given. The script-level
    // HASH_LIST_FILE variable is a local of the script body and is not
    // visible from inside this method.
    listFile.append("$filename:$md5\n")
}
/**
 * Returns the last path segment of {@code url}, i.e. the bare file name.
 *
 * Only the path component is used, so a query string ("?a=b") never ends up
 * in the name and a '/' inside the query cannot confuse the split.
 *
 * Example:
 *   getFileName(new URL('https://2ch.hk/b/src/78937663/14151422038800.jpg')) == '14151422038800.jpg'
 *
 * @param url a java.net.URL (anything exposing a {@code path} String works)
 * @return everything after the final '/' of the path; '' when the path ends in '/'
 */
def getFileName(url) {
    def path = url.path
    return path.substring(path.lastIndexOf('/') + 1)
}
/**
 * Computes the MD5 digest of a byte array.
 *
 * @param bytes the data to hash (byte[])
 * @return the digest as a 32-character, zero-padded, lowercase hex string
 */
def generateMD5(bytes) {
    def md = java.security.MessageDigest.getInstance("MD5")
    // One-shot digest over the whole array.
    def hash = md.digest(bytes)
    // encodeHex emits two lowercase hex digits per byte, so the result is
    // always 32 characters long, leading zeros included.
    return hash.encodeHex().toString()
}
/**
 * Collects the ids of the opening posts on a parsed board page whose text
 * marks them as a matching thread.
 *
 * A DIV is an opening post when its class attribute matches
 * "oppost-wrapper" (case-insensitive). It matches when the lower-cased text
 * of its first BLOCKQUOTE contains both the flickr gallery URL and the word
 * 'рисовач'.
 *
 * @param page root node of a parsed board page (as returned by parse())
 * @return list of the id attributes of the matching opening-post DIVs
 */
def findRisovach(page) {
    def opPosts = page.depthFirst().DIV.grep { it.'@class' ==~ /(?i)oppost-wrapper/ }
    return opPosts.grep { oppost ->
        // Safe navigation: an opening post without any BLOCKQUOTE gives null
        // for [0]; treat it as "no text" instead of failing with an NPE.
        def text = oppost.depthFirst().BLOCKQUOTE[0]?.text()
        if (!text) return false
        text = text.toLowerCase()
        return text.contains('https://secure.flickr.com/photos/104954057@n06/') && text.contains('рисовач')
    }*.@id
}
// --- Script body ---------------------------------------------------------

// Board index pages to scan: the bare board URL plus 1.html .. 5.html.
def pageNames = [''] + (1..5).collect { it + '.html' }
def pages = pageNames.collect { 'https://2ch.hk/b/' + it }

// Ids of all matching opening posts, with the "post-" prefix stripped so
// only the thread number remains.
def postIds = pages.collect { pageUrl -> findRisovach(parse(pageUrl)) }.flatten()
def threads = postIds*.replaceFirst(/^post-/, '')

// For every thread, gather the href of each anchor whose class is "desktop"
// and resolve it against the site root.
def siteRoot = 'https://2ch.hk/'.toURL()
def hrefs = threads.collect { thread ->
    def threadPage = parse("https://2ch.hk/b/res/${thread}.html")
    return threadPage.depthFirst().A.grep { it.@class ==~ /desktop/ }*.@href
}.flatten()
def images = hrefs.collect { href -> new URL(siteRoot, href) }

// Download each image in turn.
for (image in images) {
    saveImage(image, SAVE_FOLDER, HASH_LIST_FILE)
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement