Guest User

sync script

a guest
Aug 30th, 2013
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 41.11 KB | None | 0 0
  1. #!/usr/bin/python
  2. # coding=utf8
  3.  
  4. # Copyright (c) 2013, 4d9c64076ff3943f1d6645371b0db6f9b0576a6b7f7c1dfc2c0aa3da65afd74a
  5. # All rights reserved.
  6. #
  7. # Redistribution and use in source and binary forms, with or without
  8. # modification, are permitted provided that the following conditions are met:
  9. # * Redistributions of source code must retain the above copyright
  10. # notice, this list of conditions and the following disclaimer.
  11. # * Redistributions in binary form must reproduce the above copyright
  12. # notice, this list of conditions and the following disclaimer in the
  13. # documentation and/or other materials provided with the distribution.
  14. # * The names of its contributors may not be used to endorse or promote
  15. # products derived from this software without specific prior written
  16. # permission.
  17. # * The source may only be modified by a programmer that has reflected on
  18. # the existence of magical fairies for a period no less than 10 seconds.
  19. # After performing this exercise, the programmer may edit the source for
  20. # up to 4 hours, after which the fairy affirmation exercise must be
  21. # repeated to renew their permission to edit.
  22. #
  23. #
  24. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  25. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  26. # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  27. # DISCLAIMED. IN NO EVENT SHALL 4d9c64076ff3943f1d6645371b0db6f9b0576a6b7f7c1dfc2c0aa3da65afd74a
  28. # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  29. # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  30. # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION; OR INJURY INDUCED BY
  31. # FAIRY HALLUCINATIONS) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  32. # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  33. # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34. # SUCH DAMAGE.
  35.  
  36. import HTMLParser as html
  37. import httplib as http
  38. import sqlite3 as sq
  39. import time as tm
  40. import datetime as dt
  41. import random as ra
  42. import sys
  43. import os
  44. import threading as thrd
  45. import thread as thd
  46.  
  47.  
  48. class MonkeyPatchHTMLParser(html.HTMLParser):
  49.  
  50. # changed recovery for bad &# sequences.
  51. def goahead(self, end):
  52. rawdata = self.rawdata
  53. i = 0
  54. n = len(rawdata)
  55. while i < n:
  56. match = self.interesting.search(rawdata, i) # < or &
  57. if match:
  58. j = match.start()
  59. else:
  60. if self.cdata_elem:
  61. break
  62. j = n
  63. if i < j: self.handle_data(rawdata[i:j])
  64. i = self.updatepos(i, j)
  65. if i == n: break
  66. startswith = rawdata.startswith
  67. if startswith('<', i):
  68. if html.starttagopen.match(rawdata, i): # < + letter
  69. k = self.parse_starttag(i)
  70. elif startswith("</", i):
  71. k = self.parse_endtag(i)
  72. elif startswith("<!--", i):
  73. k = self.parse_comment(i)
  74. elif startswith("<?", i):
  75. k = self.parse_pi(i)
  76. elif startswith("<!", i):
  77. k = self.parse_html_declaration(i)
  78. elif (i + 1) < n:
  79. self.handle_data("<")
  80. k = i + 1
  81. else:
  82. break
  83. if k < 0:
  84. if not end:
  85. break
  86. k = rawdata.find('>', i + 1)
  87. if k < 0:
  88. k = rawdata.find('<', i + 1)
  89. if k < 0:
  90. k = i + 1
  91. else:
  92. k += 1
  93. self.handle_data(rawdata[i:k])
  94. i = self.updatepos(i, k)
  95. elif startswith("&#", i):
  96. match = html.charref.match(rawdata, i)
  97. if match:
  98. name = match.group()[2:-1]
  99. self.handle_charref(name)
  100. k = match.end()
  101. if not startswith(';', k-1):
  102. k = k - 1
  103. i = self.updatepos(i, k)
  104. continue
  105. else:
  106. # BEGIN MONKEY PATCH
  107. self.handle_data(rawdata[i:i+2])
  108. i = self.updatepos(i, i+2)
  109. continue
  110. # END MONKEY PATCH
  111. elif startswith('&', i):
  112. match = html.entityref.match(rawdata, i)
  113. if match:
  114. name = match.group(1)
  115. self.handle_entityref(name)
  116. k = match.end()
  117. if not startswith(';', k-1):
  118. k = k - 1
  119. i = self.updatepos(i, k)
  120. continue
  121. match = html.incomplete.match(rawdata, i)
  122. if match:
  123. # match.group() will contain at least 2 chars
  124. if end and match.group() == rawdata[i:]:
  125. self.error("EOF in middle of entity or char ref")
  126. # incomplete
  127. break
  128. elif (i + 1) < n:
  129. # not the end of the buffer, and can't be confused
  130. # with some other construct
  131. self.handle_data("&")
  132. i = self.updatepos(i, i + 1)
  133. else:
  134. break
  135. else:
  136. assert 0, "interesting.search() lied"
  137. # end while
  138. if end and i < n and not self.cdata_elem:
  139. self.handle_data(rawdata[i:n])
  140. i = self.updatepos(i, n)
  141. self.rawdata = rawdata[i:]
  142.  
  143. #######
  144. #### BEGIN HTML PARSER
  145. ####
  146.  
  147. class StackMachineHTMLParser(MonkeyPatchHTMLParser):
  148. def __init__(self, handler):
  149. html.HTMLParser.__init__(self)
  150. self.root_handler = handler
  151. self.handlers = []
  152. self.root_handler.beginParsing(self)
  153. def push_handler(self, outer_tag, handler):
  154. handler.beginParsing(self)
  155. self.handlers.append((outer_tag, handler))
  156. def pop_handler(self):
  157. self.handlers.pop()[1].endParsing(self)
  158. def top_handler(self):
  159. if len(self.handlers) > 0:
  160. return self.handlers[-1][1]
  161. else:
  162. return self.root_handler
  163. def handle_starttag(self, tag, attrs):
  164. #print "( start tag", tag, attrs
  165. self.push_handler(tag, self.top_handler().start(self, tag, attrs))
  166. def handle_startendtag(self, tag, attrs):
  167. #print "() startend tag", tag, attrs
  168. self.top_handler().startend(self, tag, attrs)
  169. def handle_data(self, data):
  170. #print "= data", data
  171. self.top_handler().data(self, data)
  172. def handle_entityref(self, name):
  173. #print "= entity", name
  174. self.top_handler().entityref(self, name)
  175. def handle_charref(self, name):
  176. #print "= char", name
  177. self.top_handler().charref(self, name)
  178. def handle_endtag(self, tag):
  179. #print ") end tag", tag, self.handlers[-1][0]
  180. if len(self.handlers) > 0:
  181. i = len(self.handlers) - 1
  182. while i >= 0:
  183. if self.handlers[i][0] == tag:
  184. break
  185. i -= 1
  186. if i != -1:
  187. while len(self.handlers) > i:
  188. self.pop_handler()
  189. self.top_handler().end(self, tag)
  190. def done(self):
  191. self.root_handler.endParsing(self)
  192.  
  193. def attributeGet(attrs, key):
  194. for attr in attrs:
  195. if attr[0] == key:
  196. return attr[1]
  197. return None
  198.  
  199. def getdata(f):
  200. return f.read().decode('utf8', 'replace')
  201.  
  202. class NullHandler:
  203. def beginParsing(self, parser): pass
  204. def start(self, parser, tag, attrs):
  205. return null_handler
  206. def startend(self, parser, tag, attrs): pass
  207. def data(self, parser, data): pass
  208. def entityref(self, parser, name): pass
  209. def charref(self, parser, name): pass
  210. def end(self, parse, tag): pass
  211. def endParsing(self, parser): pass
  212.  
  213. null_handler = NullHandler()
  214.  
  215. class BaseHandler(NullHandler):
  216. def startend(self, parser, tag, attrs):
  217. handler = self.start(parser, tag, attrs)
  218. handler.beginParsing(parser)
  219. handler.endParsing(parser)
  220. self.end(parser, tag)
  221. def entityref(self, parser, name):
  222. self.data(parser, '&' + name + ';')
  223. def charref(self, parser, name):
  224. self.data(parser, '&#' + name + ';')
  225.  
  226. class TextHandler(BaseHandler):
  227. def __init__(self, callback, done_callback):
  228. self.callback = callback
  229. self.done_callback = done_callback
  230. def print_attrs(self, attrs, closing_tag):
  231. for attr in attrs:
  232. if attr[1] == None:
  233. # the parser thinks the attribute has no value...
  234. self.callback(attr[0])
  235. else:
  236. self.callback(' ')
  237. self.callback(attr[0])
  238. self.callback('="')
  239. self.callback(attr[1].replace('"', '\\"').replace("\\","\\\\"))
  240. self.callback('"')
  241. self.callback(closing_tag)
  242. def start(self, parser, tag, attrs):
  243. self.callback('<')
  244. self.callback(tag)
  245. self.print_attrs(attrs, '>')
  246. if self.done_callback == None:
  247. return self
  248. else:
  249. return TextHandler(self.callback, None)
  250. def startend(self, parser, tag, attrs):
  251. self.callback('<')
  252. self.callback(tag)
  253. self.print_attrs(attrs, '/>')
  254. def data(self, parser, data):
  255. self.callback(data)
  256. def end(self, parser, tag):
  257. self.callback('</')
  258. self.callback(tag)
  259. self.callback('>')
  260. def endParsing(self, parser):
  261. if self.done_callback != None:
  262. self.done_callback()
  263.  
  264. def textGatherer(callback):
  265. strings = []
  266. def done():
  267. callback(''.join(strings))
  268. del strings[:]
  269. return TextHandler(strings.append, done)
  270.  
  271. def valueGatherer(dic, key):
  272. def callback(value):
  273. dic[key] = value
  274. return textGatherer(callback)
  275.  
  276. class MatchingHandler(BaseHandler):
  277. def __init__(self, match_tag, match_attrs, next_handler,
  278. done_callback = lambda: None,
  279. attributes_callback = lambda attrs: None):
  280. self.match_tag = match_tag
  281. self.match_attrs = match_attrs
  282. self.next_handler = next_handler
  283. self.done_callback = done_callback
  284. self.attributes_callback = attributes_callback
  285. def start(self, parser, tag, attrs):
  286. attrs_match = True
  287. for match_attr in self.match_attrs:
  288. if not match_attr in attrs:
  289. attrs_match = False
  290. break
  291. if tag == self.match_tag and attrs_match:
  292. self.attributes_callback(attrs)
  293. return self.next_handler
  294. else:
  295. return null_handler
  296. def endParsing(self, parser):
  297. self.done_callback()
  298.  
  299. class MultipleMatchHandler(BaseHandler):
  300. def __init__(self, cases, done_callback = lambda: None):
  301. self.cases = cases
  302. self.done_callback = done_callback
  303. for case in cases:
  304. assert(len(case) == 3)
  305. def start(self, parser, tag, attrs):
  306. for case in self.cases:
  307. match_tag = case[0]
  308. match_attrs = case[1]
  309. next_handler = case[2]
  310. if len(case) > 3:
  311. attributes_callback = case[4]
  312. else:
  313. attributes_callback = lambda attrs: None
  314. attrs_match = True
  315. for match_attr in match_attrs:
  316. if not match_attr in attrs:
  317. attrs_match = False
  318. break
  319. if tag == match_tag and attrs_match:
  320. attributes_callback(attrs)
  321. return next_handler
  322. return null_handler
  323. def endParsing(self, parsing):
  324. self.done_callback()
  325.  
  326. class SequentialMatchHandler(BaseHandler):
  327. def __init__(self, cases, done_callback = lambda: None):
  328. self.cases = cases
  329. self.index = 0
  330. self.done_callback = done_callback
  331. for case in cases:
  332. assert(len(case) == 3)
  333. def start(self, parser, tag, attrs):
  334. if self.index >= len(self.cases):
  335. return null_handler
  336. case = self.cases[self.index]
  337. match_tag = case[0]
  338. match_attrs = case[1]
  339. next_handler = case[2]
  340. attrs_match = True
  341. for match_attr in match_attrs:
  342. if not match_attr in attrs:
  343. attrs_match = False
  344. break
  345. if tag == match_tag and attrs_match:
  346. self.index += 1
  347. return next_handler
  348. return null_handler
  349. def endParsing(self, parser):
  350. self.done_callback()
  351. self.index = 0
  352.  
  353. class DataOverrider(BaseHandler):
  354. def __init__(self, data_handler, tags_handler, done_callback = None):
  355. self.data_handler = data_handler
  356. self.tags_handler = tags_handler
  357. if done_callback == None:
  358. self.done_callback = tags_handler.endParsing
  359. else:
  360. self.done_callback = done_callback
  361. def beginParsing(self, parser):
  362. self.tags_handler.beginParsing(parser)
  363. def start(self, parser, tag, attrs):
  364. return self.tags_handler.start(parser, tag, attrs)
  365. def startend(self, parser, tag, attrs):
  366. self.tags_handler.startend(parser, tag, attrs)
  367. def data(self, parser, data):
  368. self.data_handler.beginParsing(parser)
  369. self.data_handler.data(parser, data)
  370. self.data_handler.endParsing(parser)
  371. def end(self, parser, tag):
  372. return self.tags_handler.end(parser, tag)
  373. def endParsing(self, parser):
  374. self.done_callback(parser)
  375.  
  376. def listPageRowReader(row_callback):
  377. values = {}
  378. def linkGetter(attrs):
  379. val = attributeGet(attrs, 'href')
  380. if val != None:
  381. values['link'] = val
  382. def done():
  383. row_callback(values.copy())
  384. values.clear()
  385. return SequentialMatchHandler([
  386. ['td', [], null_handler],
  387. ['td', [],
  388. MatchingHandler('a', [],
  389. valueGatherer(values, 'title'),
  390. lambda: None,
  391. linkGetter)],
  392. ['td', [], valueGatherer(values, 'numposts')],
  393. ['td', [],
  394. MatchingHandler('small', [],
  395. valueGatherer(values, 'last_post_time'))]],
  396. done)
  397.  
  398. def postReader(post_callback):
  399. name_values = {}
  400. values = {}
  401. def meiruGetter(attrs):
  402. val = attributeGet(attrs, 'href')
  403. if val != None:
  404. values['postermeiru'] = val
  405. def name_done(parser):
  406. values['postername'] = name_values.get('linked_name', name_values.get('unlinked_name', ''))
  407. def done():
  408. post_callback(values.copy())
  409. values.clear()
  410. name_values.clear()
  411. return SequentialMatchHandler([
  412. ['h3', [],
  413. SequentialMatchHandler([
  414. ['span', [('class', 'postnum')],
  415. MatchingHandler('a', [], valueGatherer(values, 'postnum'))],
  416. ['span', [('class', 'postinfo')],
  417. SequentialMatchHandler([
  418. ['span', [('class', 'postername')],
  419. DataOverrider(
  420. valueGatherer(name_values, 'unlinked_name'),
  421. MatchingHandler('a', [],
  422. valueGatherer(name_values, 'linked_name'),
  423. lambda: None,
  424. meiruGetter),
  425. name_done)],
  426. ['span', [('class', 'postertrip')], valueGatherer(values, 'postertrip')],
  427. ['span', [('class', 'id')], valueGatherer(values, 'posterid')],
  428. ['span', [('class', 'posterdate')], valueGatherer(values, 'posterdate')]])]])],
  429. ['blockquote', [],
  430. MatchingHandler('p', [],
  431. valueGatherer(values, 'post'))]],
  432. done)
  433.  
  434. def listPageReader(row_callback, done_callback = lambda: None):
  435. return MatchingHandler('html', [],
  436. MatchingHandler('body', [],
  437. MatchingHandler('div', [('class', 'hborder')],
  438. MatchingHandler('div', [('class', 'head')],
  439. MatchingHandler('table', [('class', 'threads')],
  440. MatchingHandler('tbody', [],
  441. MatchingHandler('tr', [],
  442. listPageRowReader(row_callback))))))),
  443. done_callback)
  444.  
  445. def threadPageReader(title_callback, post_callback, done_callback = lambda: None):
  446. post_reader = postReader(post_callback)
  447. return MatchingHandler('html', [],
  448. MatchingHandler('body', [],
  449. MultipleMatchHandler([
  450. ['h2', [], textGatherer(title_callback)],
  451. ['div', [('class', 'thread')],
  452. MultipleMatchHandler([
  453. ['div', [('class', 'post even')], post_reader],
  454. ['div', [('class', 'post odd')], post_reader]])]])),
  455. done_callback)
  456.  
  457. #######
  458. #### ProgScraper
  459. ####
  460. #### Creates urls and downloads pages using a http(s) connection.
  461. ####
  462.  
  463. class ScraperException: pass
  464.  
  465. class BadResponse(ScraperException):
  466. def __init__(self,response):
  467. self.response = response
  468.  
  469. class ProgScraper:
  470. def __init__(self, conn_gen, min_req_delay = 0, max_req_delay = 2):
  471. self.conn_gen = conn_gen
  472. self.min_req_delay = min_req_delay
  473. self.max_req_delay = max_req_delay
  474. self.rand = ra.Random()
  475. def read_get(self, url):
  476. tm.sleep(self.rand.randint(self.min_req_delay, self.max_req_delay))
  477. print "starting download: ", url
  478. sys.stdout.flush()
  479. conn = self.conn_gen()
  480. conn.request('GET', url)
  481. response = conn.getresponse()
  482. if response.status == 200:
  483. print "finished download: ", url
  484. sys.stdout.flush()
  485. data = getdata(response)
  486. conn.close()
  487. return data
  488. else:
  489. print "bad response getting ", url, ":", response
  490. sys.stdout.flush()
  491. conn.close()
  492. raise BadResponse(response)
  493. def get_subject(self, board):
  494. return self.read_get('/' + board + '/subject.txt')
  495. def get_list_page(self, board, n):
  496. return self.read_get('/list/' + board + '/' + str(n))
  497. def get_read_page(self, board, thread, range_expr = ''):
  498. return self.read_get('/read/' + board + '/' + str(thread) + '/' + range_expr)
  499.  
  500. #######
  501. #### DummyScraper
  502. ####
  503. #### A dummy implementation of ProgScraper for testing that doesn't use any socket.
  504. ####
  505.  
  506. class DummyScraper:
  507. def get_file(self, filename):
  508. file = open(filename, 'r')
  509. data = getdata(file)
  510. file.close()
  511. return data
  512. def get_subject(self, board):
  513. print "DummyScrapper getting subject for", board
  514. sys.stdout.flush()
  515. return self.get_file('subject.txt')
  516. def get_list_page(self, board, n):
  517. print "DummyScrapper getting list for", board, n
  518. sys.stdout.flush()
  519. return self.get_file('list.html')
  520. def get_read_page(self, board, thread, range_expr = ''):
  521. print "DummyScrapper getting read page for", board, thread, range_expr
  522. sys.stdout.flush()
  523. return self.get_file('thread.html')
  524.  
  525. #######
  526. #### ProgParser
  527. ####
  528. #### Routines for extracting data from fetched pages.
  529. ####
  530.  
  531. class ProgParser:
  532. def parse_subject(self, subject):
  533. return [line.split('<>') for line in subject.split('\n')]
  534. subject_indeces = {
  535. 'title' : 0,
  536. 'opname' : 1,
  537. 'noicon' : 2, # wtf is this noicon.png?
  538. 'threadid' : 3,
  539. 'numposts' : 4,
  540. 'lastpostername' : 5,
  541. 'lastpostertime' : 6,
  542. 'numindeces' : 7,
  543. }
  544. def parse_list_page(self, page):
  545. rows = []
  546. reader = StackMachineHTMLParser(listPageReader(rows.append))
  547. reader.feed(page)
  548. reader.close()
  549. return rows
  550. def parse_read_page(self, page):
  551. values = { 'title' : '', 'posts' : []}
  552. def title_callback(value):
  553. values['title'] = value
  554. reader = StackMachineHTMLParser(threadPageReader(title_callback, values['posts'].append))
  555. reader.feed(page)
  556. reader.close()
  557. return values
  558. def parse_timestamp(self, string):
  559. try:
  560. (date_str, time_str) = string.split(' ')
  561. (year_str, month_str, day_str) = date_str.split('-')
  562. (hour_str, minute_str) = time_str.split(':')
  563. return int(dt.datetime(int(year_str), int(month_str), int(day_str), int(hour_str), int(minute_str)).strftime('%s'))
  564. except Exception:
  565. return None
  566.  
  567. #######
  568. #### BEGIN PROGDB
  569. ####
  570. #### Reads and writes data to a sqlite database.
  571. ####
  572.  
  573. class ProgDB:
  574.  
  575. dummykey = 0
  576.  
  577. def __init__(self, conn):
  578. self.conn = conn
  579. def commit(self):
  580. self.conn.commit()
  581. def create(self):
  582. cur = self.conn.cursor()
  583. cur.execute('CREATE TABLE '
  584. 'IF NOT EXISTS '
  585. 'threads (threadid integer,'
  586. 'title text,'
  587. 'board text,'
  588. 'firstposttime integer,'
  589. 'lastposttime integer,'
  590. 'lastbumptime integer,'
  591. 'numposts integer,'
  592. 'PRIMARY KEY (threadid))')
  593. cur.execute('CREATE TABLE '
  594. 'IF NOT EXISTS '
  595. 'posts (threadid integer,'
  596. 'postnum integer,'
  597. 'postername text,'
  598. 'postermeiru text,'
  599. 'postertrip text,'
  600. 'posterid text,'
  601. 'posterdate integer,'
  602. 'post text,'
  603. 'PRIMARY KEY (threadid, postnum),'
  604. 'FOREIGN KEY (threadid) REFERENCES threads(threadid))')
  605. # singleton table
  606. cur.execute('CREATE TABLE '
  607. 'IF NOT EXISTS '
  608. 'syncdata (dummykey integer,'
  609. 'lastsynctime integer,'
  610. 'PRIMARY KEY (dummykey))')
  611. cur.execute('INSERT OR IGNORE INTO syncdata VALUES (?, NULL)', (ProgDB.dummykey,))
  612. cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS threads_id on threads(threadid)')
  613. cur.execute('CREATE INDEX IF NOT EXISTS threads_lastbumptime on threads(board, lastbumptime)')
  614. cur.execute('CREATE INDEX IF NOT EXISTS threads_lastposttime on threads(board, lastposttime)')
  615. cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS posts_id on posts(threadid, postnum)')
  616. def write_thread(self, threadid, title, board, firstposttime, lastposttime, lastbumptime, numposts):
  617. cur = self.conn.cursor()
  618. cur.execute('INSERT OR IGNORE INTO threads values (?,?,?,?,?,?,?)',
  619. (threadid, title, board, firstposttime, lastposttime, lastbumptime, numposts))
  620. def write_empty_thread(self, threadid, title, board):
  621. self.write_thread(threadid, title, board, 0, 0, 0, 0)
  622. def update_thread(self, threadid, firstposttime, lastposttime, lastbumptime, numposts):
  623. cur = self.conn.cursor()
  624. cur.execute('UPDATE threads SET firstposttime = ?,'
  625. 'lastposttime = ?,'
  626. 'lastbumptime = ?,'
  627. 'numposts = max(numposts, ?)'
  628. 'WHERE threadid = ?'
  629. 'LIMIT 1',
  630. (firstposttime, lastposttime, lastbumptime, numposts, threadid))
  631. def insert_post(self, threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post):
  632. cur = self.conn.cursor()
  633. cur.execute('INSERT OR IGNORE INTO posts values (?,?,?,?,?,?,?,?)',
  634. (threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post) )
  635. def write_post(self, threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post):
  636. self.insert_post(threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post)
  637. cur = self.conn.cursor()
  638. cur.execute('UPDATE threads SET lastposttime = max(lastposttime, ?),'
  639. 'numposts = max(numposts, ?)'
  640. 'WHERE threadid = ?'
  641. 'LIMIT 1',
  642. (posterdate, postnum, threadid))
  643. if postermeiru != 'mailto:sage':
  644. cur.execute('UPDATE threads SET lastbumptime = max(lastbumptime, ?)'
  645. 'WHERE threadid = ?',
  646. (posterdate, threadid))
  647. if postnum == 1:
  648. cur.execute('UPDATE threads SET firstposttime = ?'
  649. 'WHERE threadid = ?',
  650. (posterdate, threadid))
  651. def get_threads_by_board(self, board, limit = None, offset = None):
  652. cur = self.conn.cursor()
  653. if limit == None and offset == None:
  654. cur.execute('select * from threads where threads.board = ? ORDER BY threads.lastbumptime DESC',
  655. (board,))
  656. elif offset == None:
  657. cur.execute('select * from threads where threads.board = ? ORDER BY threads.lastbumptime DESC limit ?',
  658. (board, limit))
  659. else:
  660. cur.execute('select * from threads where threads.board = ? ORDER BY threads.lastbumptime DESC limit ? offset ?',
  661. (board, limit, offset))
  662. return cur
  663. def get_posts_by_thread(self, threadid, limit = None, offset = None):
  664. cur = self.conn.cursor()
  665. if limit == None and offset == None:
  666. cur.execute('select * from posts where posts.threadid = ? order by posts.postnum',
  667. (threadid,))
  668. elif offset == None:
  669. cur.execute('select * from posts where posts.threadid = ? order by posts.postnum limit ?',
  670. (threadid, limit))
  671. else:
  672. cur.execute('select * from posts where posts.threadid = ? order by posts.postnum limit ? offset ?',
  673. (threadid, limit, offset))
  674. return cur
  675. def get_thread(self, threadid):
  676. cur = self.conn.cursor()
  677. cur.execute('select * from threads where threads.threadid = ?', (threadid,))
  678. return cur.fetchone()
  679. def get_post(self, threadid, postnum):
  680. cur = self.conn.cursor()
  681. cur.execute('select * from posts where posts.threadid = ? and posts.postnum = ?', (threadid, postnum))
  682. return cur.fetchone()
  683.  
  684. class ThreadEditor:
  685. def __init__(self, db, threadid, firstposttime, lastposttime, lastbumptime, numposts):
  686. self.db = db
  687. self.threadid = threadid
  688. self.firstposttime = firstposttime
  689. self.lastposttime = lastposttime
  690. self.lastbumptime = lastbumptime
  691. self.numposts = numposts
  692. def add_post(self, postnum, postermeiru, posterdate):
  693. if postnum == 1:
  694. self.firstposttime = posterdate
  695. self.lastposttime = max(self.lastposttime, posterdate)
  696. if postermeiru != 'mailto:sage':
  697. self.lastbumptime = max(self.lastbumptime, posterdate)
  698. self.numposts = max(self.numposts, postnum)
  699. def write_back(self):
  700. self.db.update_thread(self.threadid, self.firstposttime, self.lastposttime, self.lastbumptime, self.numposts)
  701.  
  702. def get_thread_editor(self, threadid):
  703. (threadid_, title, board, firstposttime, lastposttime, lastbumptime, numposts) = self.get_thread(threadid)
  704. assert(threadid_ == threadid)
  705. return ProgDB.ThreadEditor(self, threadid, firstposttime, lastposttime, lastbumptime, numposts)
  706.  
  707. def get_syncdata(self):
  708. cur = self.conn.cursor()
  709. cur.execute('SELECT * FROM syncdata WHERE dummykey = ? LIMIT 1', (ProgDB.dummykey,))
  710. return cur.fetchone()
  711.  
  712. def set_lastsynctime(self, lastsynctime):
  713. cur = self.conn.cursor()
  714. cur.execute('UPDATE syncdata SET lastsynctime = ?'
  715. 'WHERE dummykey = ?',
  716. (lastsynctime, ProgDB.dummykey))
  717.  
  718. def get_lastsynctime(self):
  719. (dummykey, lastsynctime) = self.get_syncdata()
  720. return lastsynctime
  721.  
  722. def current_progtime(self):
  723. return int((dt.datetime.utcnow() - dt.timedelta(hours=4)).strftime('%s'))
  724.  
  725.  
  726.  
  727.  
  728. thread_keys = ('threadid', 'title', 'board', 'firstposttime', 'lastposttime', 'lastbumptime', 'numposts')
  729. post_keys = ('threadid', 'postnum', 'postername', 'postermeiru', 'postertrip', 'posterid', 'posterdate', 'post')
  730.  
  731. def reverse_array_map(arr, inverse_map):
  732. for i in xrange(0, len(arr)):
  733. inverse_map[arr[i]] = i
  734.  
  735. thread_key_indeces = {}
  736. reverse_array_map(thread_keys, thread_key_indeces)
  737.  
  738. post_key_indeces = {}
  739. reverse_array_map(post_keys, post_key_indeces)
  740.  
  741. def print_row(keys, values, indent = ''):
  742. assert(len(keys) == len(values))
  743. print indent, [(keys[i], values[i]) for i in xrange(0,len(keys))]
  744. sys.stdout.flush()
  745.  
  746. def printDB(progdb):
  747. cur = progdb.get_threads_by_board('prog')
  748. for thread in cur:
  749. threadid = thread[0]
  750. print_row(thread_keys, thread)
  751. posts = progdb.get_posts_by_thread(threadid)
  752. for post in posts:
  753. print_row(post_keys, post, ' ')
  754.  
  755. #######
  756. #### BEGIN ProgSynch
  757. ####
  758. #### Takes data from ProgScraper and updates a ProgDB.
  759. ####
  760.  
  761. class ProgSynch:
  762. def __init__(self, scraper, parser, db):
  763. self.scraper = scraper
  764. self.parser = parser
  765. self.db = db
  766. self.abortOp = False
  767. def abort(self):
  768. self.abortOp = True
  769. def reset(self):
  770. self.abortOp = False
  771. def commit(self):
  772. self.db.commit()
  773. def write_thread_page_data(self, board, threadid, page, expected_postnum_start = 1):
  774. parsed_page = self.parser.parse_read_page(page)
  775. if not 'title' in parsed_page:
  776. print "warning: malformed thread, could not read title:", parsed_page
  777. sys.stdout.flush()
  778. self.db.write_empty_thread(threadid, parsed_page.get('title', ''), board)
  779. editor = self.db.get_thread_editor(threadid)
  780. backup_postnum = expected_postnum_start
  781. for post in parsed_page['posts']:
  782. postnum_valid = False
  783. # keep a post num count for recovery
  784. try:
  785. postnum = int(post['postnum'])
  786. backup_postnum = postnum
  787. postnum_valid = True
  788. except Exception:
  789. postnum = backup_postnum
  790. backup_postnum += 1
  791. postername = post.get('postername', '')
  792. postermeiru = post.get('postermeiru', '')
  793. postertrip = post.get('postertrip', '')
  794. posterid = post.get('posterid', '')
  795. posterdate = post.get('posterdate', '')
  796. text = post.get('post', '')
  797. if not postnum_valid or \
  798. not 'postername' in post or \
  799. not 'postertrip' in post or \
  800. not 'posterid' in post or \
  801. not 'posterdate' in post or \
  802. not 'post' in post:
  803. print "warning: malformed post in read page.", post
  804. sys.stdout.flush()
  805. editor.add_post(postnum, postermeiru, posterdate)
  806. self.db.insert_post(threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, text)
  807. editor.write_back()
  808. def write_thread_page_file(self, board, threadid, filename):
  809. file = open(filename, 'r')
  810. page = getdata(file)
  811. file.close()
  812. self.write_thread_page_data(board, threadid, page)
  813. def write_thread_page(self, board, threadid, range_expr = ''):
  814. page = self.scraper.get_read_page(board, threadid, range_expr)
  815. self.write_thread_page_data(board, threadid, page)
  816. def synch_subject_page_data(self, board, page):
  817. parsed_page = self.parser.parse_subject(page)
  818. modified = False
  819. current_synctime = self.db.current_progtime()
  820. lastsynctime = self.db.get_lastsynctime()
  821. for entry in parsed_page:
  822. if self.abortOp:
  823. break
  824. try:
  825. assert(len(entry) == ProgParser.subject_indeces['numindeces'])
  826. threadid = int(entry[ProgParser.subject_indeces['threadid']])
  827. numposts = int(entry[ProgParser.subject_indeces['numposts']])
  828. lastposttime = int(entry[ProgParser.subject_indeces['lastpostertime']])
  829. local_thread = self.db.get_thread(threadid)
  830. if local_thread == None:
  831. # thread does not exist in local storage. Synch the entire thread.
  832. #print "local thread not found", threadid, numposts
  833. self.write_thread_page(board, threadid)
  834. modifed = True
  835. else:
  836. local_numposts = int(local_thread[thread_key_indeces['numposts']])
  837. #print "local thread found", threadid, "local", local_numposts, "subject", numposts, "last post time", lastposttime, "last sync time", lastsynctime
  838. if lastsynctime <= lastposttime and local_numposts < numposts:
  839. # new posts have appeared in the thread. Synch the new posts.
  840. self.write_thread_page(board, threadid, str(local_numposts + 1) + '-')
  841. modifed = True
  842. except Exception as e:
  843. print "malformed entry in subject page. Could not synch:", entry
  844. print e.message, e.args
  845. sys.stdout.flush()
  846. self.db.set_lastsynctime(current_synctime)
  847. return modified
  def synch_subject_page_file(self, board, filename):
      """Synch from a subject page that was saved to a local file.

      board    -- board name, forwarded to synch_subject_page_data
      filename -- path of the file holding the saved subject page

      Returns whatever synch_subject_page_data returns for the file's data.
      """
      # `with` closes the handle even if getdata() raises, and the local
      # name no longer shadows the Python 2 builtin `file`.
      with open(filename, 'r') as page_file:
          page = getdata(page_file)
      return self.synch_subject_page_data(board, page)
  def synch_subject_page(self, board):
      """Fetch the board's subject page through the scraper and synch from it.

      Returns the result of synch_subject_page_data for the fetched page.
      """
      return self.synch_subject_page_data(board, self.scraper.get_subject(board))
  856. def synch_list_page_data(self, board, page):
  857. parsed_page = self.parser.parse_list_page(page)
  858. lastsynchtime = self.db.get_lastsynctime()
  859. modified = False
  860. for entry in parsed_page:
  861. if self.abortOp:
  862. break
  863. try:
  864. link = entry['link']
  865. numposts = int(entry['numposts'])
  866. last_post_time = entry['last_post_time']
  867. last_post_timestamp = self.parser.parse_timestamp(last_post_time)
  868. prefix = 'read/' + board + '/'
  869. assert(link.startswith(prefix))
  870. threadid = int(link[len(prefix):])
  871. local_thread = self.db.get_thread(threadid)
  872. if local_thread == None:
  873. # thread does not exist in local db. Synch entire thread.
  874. self.write_thread_page(board, threadid)
  875. modified = True
  876. else:
  877. local_numposts = local_thread[thread_key_indeces['numposts']]
  878. if (last_post_timestamp == None or lastsynctime < last_post_timestamp) and \
  879. local_numposts < numposts:
  880. # synch the new posts
  881. self.write_thread_page(board, threadid, str(local_numposts + 1) + '-')
  882. except Exception as e:
  883. print "error: malformed entry in list page. Could not synch:", entry
  884. print e.message, e.args
  885. sys.stdout.flush()
  886. return modified
  def synch_list_page_file(self, board, filename):
      """Synch from a thread-list page that was saved to a local file.

      board    -- board name, forwarded to synch_list_page_data
      filename -- path of the file holding the saved list page

      Returns whatever synch_list_page_data returns for the file's data.
      """
      # `with` closes the handle even if getdata() raises, and the local
      # name no longer shadows the Python 2 builtin `file`.
      with open(filename, 'r') as page_file:
          page = getdata(page_file)
      return self.synch_list_page_data(board, page)
  def synch_list_page(self, board, list_page_num):
      """Fetch list page `list_page_num` of `board` and synch from it.

      Returns the result of synch_list_page_data for the fetched page.
      """
      return self.synch_list_page_data(
          board, self.scraper.get_list_page(board, list_page_num))
  def synch_top_list_pages(self, board, num_pages):
      """Synch list pages 1..num_pages of `board`, in order.

      Stops early, before fetching the next page, once self.abortOp is set.
      """
      page_num = 1
      while page_num <= num_pages and not self.abortOp:
          self.synch_list_page(board, page_num)
          page_num += 1
  def synch_list_pages_while_modifying(self, board):
      """Synch list pages 1, 2, 3, ... until one reports no modification.

      The loop ends after the first page for which synch_list_page returns
      a false value, or before fetching the next page once self.abortOp is
      set.
      """
      page_num = 1
      while not self.abortOp:
          if not self.synch_list_page(board, page_num):
              break
          page_num += 1
  def import_threads_dir(self, board, dirname, filename_to_threadid = lambda filename: int(filename)):
      """Import every entry of `dirname` as a saved thread page.

      board                -- board name, forwarded to write_thread_page_file
      dirname              -- directory whose entries are imported
      filename_to_threadid -- maps an entry name to its thread id; the
                              default treats the whole name as an integer

      Stops before the next entry once self.abortOp is set. An exception
      raised by filename_to_threadid is not caught here.
      """
      for name in os.listdir(dirname):
          if self.abortOp:
              return
          # One parenthesised argument yields the same output as the
          # two-item print statement would for a string name.
          print("importing thread" + " " + name)
          sys.stdout.flush()
          self.write_thread_page_file(
              board, filename_to_threadid(name), dirname + '/' + name)
  def import_nested_threads_dir(self, board, dirname, filename_to_threadid = lambda filename: int(filename)):
      """Import saved thread pages from `dirname` and all its subdirectories.

      board                -- board name, forwarded to write_thread_page_file
      dirname              -- root of the directory tree to walk
      filename_to_threadid -- maps a file name to its thread id; files for
                              which it raises are silently skipped

      Both loops stop once self.abortOp is set.
      """
      for path, _subdirs, filenames in os.walk(dirname):
          if self.abortOp:
              break
          # The literal keeps its trailing space, so together with the
          # separator the path is preceded by two spaces.
          print("importing all threads under " + " " + path)
          sys.stdout.flush()
          for name in filenames:
              if self.abortOp:
                  break
              try:
                  threadid = filename_to_threadid(name)
              except Exception:
                  # the name does not map to a thread id; skip this file
                  continue
              filepath = '/'.join((path, name))
              print("importing thread" + " " + filepath)
              sys.stdout.flush()
              self.write_thread_page_file(board, threadid, filepath)
  934.  
  935. #######
  936. #### BEGIN ProgAsynch
  937. ####
  938. #### Issues async commands to a ProgSynch and can abort commands.
  939. ####
  940.  
  class ProgAsynch:
      """Runs one command at a time against a synch object on a worker thread.

      A command is any callable that accepts the synch object as its only
      argument. The synch object must provide reset() and abort().
      """

      def __init__(self, sync):
          # sync: the object handed to every command
          self.sync = sync
          # Thread of the most recently started command, or None when no
          # command has been started since construction or the last wait().
          self.running_thread = None

      def start_command(self, command):
          """Start `command(self.sync)` on a new thread.

          Returns True if the command was started, or False if a previously
          started command is still running.
          """
          # A finished thread no longer blocks new commands. Testing only
          # for `running_thread == None`, as before, refused every command
          # after the first until wait() had been called, even though
          # is_done() already reported True.
          if not self.is_done():
              # another command is running...
              return False
          self.sync.reset()
          self.running_thread = thrd.Thread(target=command, args=(self.sync,))
          self.running_thread.start()
          return True

      def abort(self):
          """Call sync.abort() if a command has been started and not yet waited for."""
          if self.running_thread is not None:
              self.sync.abort()

      def is_done(self):
          """Return True when no command thread exists or it has finished."""
          return self.running_thread is None or not self.running_thread.is_alive()

      def wait(self):
          """Block until the current command thread finishes, then forget it."""
          if self.running_thread is not None:
              self.running_thread.join()
              self.running_thread = None
  963.  
  def parser_test1():
      """Parse the dummy scraper's read page for thread 1337 of 'prog' and print it."""
      scraper, parser = DummyScraper(), ProgParser()
      result = parser.parse_read_page(scraper.get_read_page('prog', 1337))
      print("parsed page")
      print(result)
  971.  
  def synch_test1():
      """Write thread 1337 of 'prog' from the dummy scraper into prog.db."""
      scraper = DummyScraper()
      parser = ProgParser()
      dbname = 'prog.db'
      dbconn = sq.connect(dbname)
      # try/finally: the connection is closed even if the synch raises.
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)
          synch.write_thread_page('prog', 1337)
          synch.commit()
          #printDB(db)
      finally:
          dbconn.close()
  984.  
  def synch_test3():
      """Synch prog.db from the dummy scraper's subject page for 'prog'."""
      scraper = DummyScraper()
      parser = ProgParser()
      dbname = 'prog.db'
      dbconn = sq.connect(dbname)
      # try/finally: the connection is closed even if the synch raises.
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)
          synch.synch_subject_page('prog')
          synch.commit()
          #printDB(db)
      finally:
          dbconn.close()
  997.  
  def synch_test4():
      """Synch prog.db from the dummy scraper's first 5 list pages for 'prog'."""
      scraper = DummyScraper()
      parser = ProgParser()
      dbname = 'prog.db'
      dbconn = sq.connect(dbname)
      # try/finally: the connection is closed even if the synch raises.
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)
          synch.synch_top_list_pages('prog', 5)
          synch.commit()
          #printDB(db)
      finally:
          dbconn.close()
  1010.  
  def synch_test5():
      """Import every '<threadid>.html' file in the 'threads2' directory into prog.db."""
      scraper = DummyScraper()
      parser = ProgParser()
      dbname = 'prog.db'
      dbconn = sq.connect(dbname)
      # try/finally: the connection is closed even if the import raises.
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)

          def filename_to_threadid(filename):
              # Explicit check instead of `assert` (stripped under -O);
              # the result is no longer bound to the builtin name `id`.
              if not filename.endswith('.html'):
                  raise ValueError("not a thread file: %r" % (filename,))
              return int(filename[:-len('.html')])

          synch.import_threads_dir('prog', 'threads2', filename_to_threadid)
          synch.commit()
          #printDB(db)
      finally:
          dbconn.close()
  1027.  
  def synch_test6():
      """Import every '<threadid>.html' file found anywhere under 'threads' into prog.db."""
      scraper = DummyScraper()
      parser = ProgParser()
      dbname = 'prog.db'
      dbconn = sq.connect(dbname)
      # try/finally: the connection is closed even if the import raises.
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)

          def filename_to_threadid(filename):
              # Explicit check instead of `assert` (stripped under -O).
              # Raising makes import_nested_threads_dir skip the file.
              if not filename.endswith('.html'):
                  raise ValueError("not a thread file: %r" % (filename,))
              return int(filename[:-len('.html')])

          synch.import_nested_threads_dir('prog', 'threads', filename_to_threadid)
          synch.commit()
          #printDB(db)
      finally:
          dbconn.close()
  1043.  
  def demo1():
      """Synch prog.db from the first 5 list pages of 'prog' on dis.4chan.org."""
      # Connection factory handed to the scraper: HTTPS with a 20 s timeout.
      conn_gen = lambda: http.HTTPSConnection('dis.4chan.org', timeout=20)
      scraper = ProgScraper(conn_gen)
      parser = ProgParser()
      dbname = 'prog.db'
      dbconn = sq.connect(dbname)
      # try/finally: the connection is closed even if the synch raises
      # (for example on a network error).
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)
          synch.synch_top_list_pages('prog', 5)
          synch.commit()
          #printDB(db)
      finally:
          dbconn.close()
  1057.  
  1058. # sync all of /prog/
  def demo2():
      """Synch prog.db from the subject page of 'prog' on dis.4chan.org."""
      # Connection factory handed to the scraper: HTTPS with a 20 s timeout.
      conn_gen = lambda: http.HTTPSConnection('dis.4chan.org', timeout=20)
      scraper = ProgScraper(conn_gen)
      parser = ProgParser()
      dbname = 'prog.db'
      dbconn = sq.connect(dbname)
      # try/finally: the connection is closed even if the synch raises
      # (for example on a network error).
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)
          synch.synch_subject_page('prog')
          synch.commit()
          #printDB(db)
      finally:
          dbconn.close()
  1072.  
  1073. # sync from dummy subject but then stop after 30 seconds.
  def async_test1():
      """Start a subject-page synch on a worker thread and abort it after 30 seconds."""
      scraper = DummyScraper()
      parser = ProgParser()
      dbname = 'prog.db'
      # check_same_thread=False because the connection is created here but
      # used by the worker thread that runs the command.
      dbconn = sq.connect(dbname, check_same_thread=False)
      try:
          db = ProgDB(dbconn)
          db.create()
          synch = ProgSynch(scraper, parser, db)
          asynch = ProgAsynch(synch)

          def command(synch):
              synch.synch_subject_page('prog')
              synch.commit()

          asynch.start_command(command)
          try:
              tm.sleep(30)
          finally:
              # Always stop and join the worker, even if the sleep is
              # interrupted, so the connection is not closed underneath it.
              asynch.abort()
              asynch.wait()

          #printDB(db)
      finally:
          dbconn.close()
Advertisement
Add Comment
Please, Sign In to add comment