Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # coding=utf8
- # Copyright (c) 2013, 4d9c64076ff3943f1d6645371b0db6f9b0576a6b7f7c1dfc2c0aa3da65afd74a
- # All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * The names of its contributors may not be used to endorse or promote
- # products derived from this software without specific prior written
- # permission.
- # * The source may only be modified by a programmer that has reflected on
- # the existence of magical fairies for a period no less than 10 seconds.
- # After performing this exercise, the programmer may edit the source for
- # up to 4 hours, after which the fairy affirmation exercise must be
- # repeated to renew their permission to edit.
- #
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL 4d9c64076ff3943f1d6645371b0db6f9b0576a6b7f7c1dfc2c0aa3da65afd74a
- # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION; OR INJURY INDUCED BY
- # FAIRY HALLUCINATIONS) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- # IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- # SUCH DAMAGE.
- import HTMLParser as html
- import httplib as http
- import sqlite3 as sq
- import time as tm
- import datetime as dt
- import random as ra
- import sys
- import os
- import threading as thrd
- import thread as thd
class MonkeyPatchHTMLParser(html.HTMLParser):
    """HTMLParser subclass that overrides ``goahead`` to recover from bad ``&#``.

    Only ``goahead`` is replaced.  The branch between the BEGIN/END MONKEY
    PATCH markers is the changed recovery path: a ``&#`` that does not start
    a valid character reference is handed on as two characters of plain data
    and scanning continues after it.

    NOTE(review): the rest of the method presumably mirrors the stock
    ``HTMLParser.goahead`` of the interpreter this was written against --
    confirm before running on a different Python version.
    """
    # changed recovery for bad &# sequences.
    def goahead(self, end):
        """Scan ``self.rawdata`` and invoke the handler callbacks for its content.

        end -- true when no further input will be fed.  Constructs that are
               still incomplete are then emitted as data (or reported through
               ``self.error`` for a dangling entity) instead of being kept.

        The part of the buffer that was not consumed is stored back into
        ``self.rawdata``.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                # nothing interesting left; inside a CDATA element the tail is kept
                if self.cdata_elem:
                    break
                j = n
            # everything before the next '<' or '&' is plain text
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # k becomes the index just past the construct, or < 0 when the
                # construct is not complete yet
                if html.starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # a lone '<' followed by something else is literal text
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # incomplete construct: wait for more input unless this is
                    # the final call
                    if not end:
                        break
                    # final call: emit up to the next '>' (inclusive), else up
                    # to the next '<', else just this one character
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = html.charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the terminating character
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the terminator is only consumed when it is a ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    # BEGIN MONKEY PATCH
                    # not a character reference: pass '&#' through as text and
                    # keep scanning after it
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                    continue
                    # END MONKEY PATCH
            elif startswith('&', i):
                match = html.entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # the terminator is only consumed when it is a ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = html.incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        # on the final call any unparsed tail outside CDATA is emitted as text
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
- #######
- #### BEGIN HTML PARSER
- ####
class StackMachineHTMLParser(MonkeyPatchHTMLParser):
    """Parser that routes HTML events to a stack of handler objects.

    Every start tag asks the currently active handler for a child handler
    and stacks it together with the tag name; the matching end tag unstacks
    it again.  While the stack is empty the root handler given to the
    constructor receives the events.
    """

    def __init__(self, handler):
        """Store ``handler`` as root handler and tell it that parsing begins."""
        html.HTMLParser.__init__(self)
        self.root_handler = handler
        self.handlers = []
        handler.beginParsing(self)

    def push_handler(self, outer_tag, handler):
        """Activate ``handler`` and stack it under the tag that opened it."""
        handler.beginParsing(self)
        self.handlers.append((outer_tag, handler))

    def pop_handler(self):
        """Unstack the innermost handler and tell it that it is finished."""
        _tag, finished = self.handlers.pop()
        finished.endParsing(self)

    def top_handler(self):
        """Return the innermost stacked handler, or the root handler if none."""
        if self.handlers:
            return self.handlers[-1][1]
        return self.root_handler

    def handle_starttag(self, tag, attrs):
        """Let the active handler pick the handler for the new element."""
        parent = self.top_handler()
        child = parent.start(self, tag, attrs)
        self.push_handler(tag, child)

    def handle_startendtag(self, tag, attrs):
        """Forward a self-closing tag to the active handler."""
        self.top_handler().startend(self, tag, attrs)

    def handle_data(self, data):
        """Forward text to the active handler."""
        self.top_handler().data(self, data)

    def handle_entityref(self, name):
        """Forward a named entity reference to the active handler."""
        self.top_handler().entityref(self, name)

    def handle_charref(self, name):
        """Forward a numeric character reference to the active handler."""
        self.top_handler().charref(self, name)

    def handle_endtag(self, tag):
        """Unwind the stack down to the innermost element opened by ``tag``.

        End tags without a matching open element on the stack are ignored.
        NOTE(review): the indentation of this method was lost in SOURCE; the
        final ``end`` call is assumed to happen only when a matching element
        was found -- confirm against the original file.
        """
        depth = None
        # search from the innermost element outwards
        for position in range(len(self.handlers) - 1, -1, -1):
            if self.handlers[position][0] == tag:
                depth = position
                break
        if depth is None:
            return
        # close the matched element and everything still open inside it
        while len(self.handlers) > depth:
            self.pop_handler()
        # the handler that created the element is told that it ended
        self.top_handler().end(self, tag)

    def done(self):
        """Tell the root handler that parsing is over."""
        self.root_handler.endParsing(self)
def attributeGet(attrs, key):
    """Return the value of the first ``(name, value)`` pair whose name is ``key``.

    attrs -- iterable of indexable pairs, name first and value second
    key   -- attribute name to look for

    Returns None when no pair carries that name.
    """
    candidates = (pair[1] for pair in attrs if pair[0] == key)
    return next(candidates, None)
def getdata(f):
    """Read everything from ``f`` and decode it as UTF-8.

    f -- object whose ``read()`` returns a byte string

    Undecodable bytes are replaced instead of raising.
    """
    raw = f.read()
    return raw.decode('utf8', 'replace')
class NullHandler:
    """Handler that ignores every parser event.

    ``start`` answers with the shared ``null_handler`` instance, so all
    content nested below an ignored element is ignored as well.
    """

    def beginParsing(self, parser):
        """Called when the handler becomes active; does nothing."""
        pass

    def start(self, parser, tag, attrs):
        """Return the handler for a nested element: always ``null_handler``."""
        return null_handler

    def startend(self, parser, tag, attrs):
        """Called for a self-closing tag; does nothing."""
        pass

    def data(self, parser, data):
        """Called for text; does nothing."""
        pass

    def entityref(self, parser, name):
        """Called for a named entity reference; does nothing."""
        pass

    def charref(self, parser, name):
        """Called for a numeric character reference; does nothing."""
        pass

    def end(self, parse, tag):
        """Called when a nested element ended; does nothing."""
        pass

    def endParsing(self, parser):
        """Called when the handler is retired; does nothing."""
        pass


# single shared instance handed out wherever events are to be discarded
null_handler = NullHandler()
class BaseHandler(NullHandler):
    """Handler base class that maps compound events onto the basic ones.

    A self-closing tag is replayed as a start event immediately followed by
    an end event, and entity / character references are re-serialised and
    delivered through ``data``.
    """

    def startend(self, parser, tag, attrs):
        """Treat ``<tag/>`` as an element that opens and closes at once."""
        child = self.start(parser, tag, attrs)
        # the child is activated and retired without receiving any content
        child.beginParsing(parser)
        child.endParsing(parser)
        self.end(parser, tag)

    def entityref(self, parser, name):
        """Deliver a named entity reference as the text ``&name;``."""
        self.data(parser, ''.join(['&', name, ';']))

    def charref(self, parser, name):
        """Deliver a numeric character reference as the text ``&#name;``."""
        self.data(parser, ''.join(['&#', name, ';']))
class TextHandler(BaseHandler):
    """Handler that re-serialises everything it sees as markup text.

    Each piece of output is passed to ``callback``.  The handler created
    with a ``done_callback`` is the outermost one: it hands out helper
    handlers (without ``done_callback``) for nested elements and calls
    ``done_callback`` once when it is retired.
    """

    def __init__(self, callback, done_callback):
        """
        callback      -- called with every string fragment that is produced
        done_callback -- called without arguments from ``endParsing``, or
                         None for the nested helper handlers
        """
        self.callback = callback
        self.done_callback = done_callback

    def print_attrs(self, attrs, closing_tag):
        """Emit the attribute list of a tag followed by ``closing_tag``.

        attrs       -- list of ``(name, value)`` pairs; value may be None
        closing_tag -- text that terminates the tag, ``'>'`` or ``'/>'``
        """
        emit = self.callback
        for name, value in attrs:
            if value is None:
                # the parser thinks the attribute has no value...
                # (the bare name is emitted without a separating space,
                # exactly as before)
                emit(name)
            else:
                emit(' ')
                emit(name)
                emit('="')
                # Backslashes have to be doubled BEFORE quotes are escaped;
                # the other way round the backslash that protects a quote
                # would itself be doubled and the quote left unescaped.
                emit(value.replace("\\", "\\\\").replace('"', '\\"'))
                emit('"')
        emit(closing_tag)

    def start(self, parser, tag, attrs):
        """Emit the opening tag and return the handler for its content."""
        self.callback('<')
        self.callback(tag)
        self.print_attrs(attrs, '>')
        if self.done_callback is None:
            # nested helpers carry no state of their own and can be reused
            return self
        # the outermost handler must not be retired by a nested element, so
        # a helper without done_callback is handed out instead
        return TextHandler(self.callback, None)

    def startend(self, parser, tag, attrs):
        """Emit a self-closing tag."""
        self.callback('<')
        self.callback(tag)
        self.print_attrs(attrs, '/>')

    def data(self, parser, data):
        """Emit text unchanged."""
        self.callback(data)

    def end(self, parser, tag):
        """Emit the closing tag of a nested element."""
        self.callback('</')
        self.callback(tag)
        self.callback('>')

    def endParsing(self, parser):
        """Signal completion through ``done_callback`` when one was given."""
        if self.done_callback is not None:
            self.done_callback()
def textGatherer(callback):
    """Return a TextHandler that collects an element's content as one string.

    callback -- called with the concatenated text when the handler is
                retired; the internal buffer is emptied afterwards so the
                handler can be used again.
    """
    fragments = []

    def flush():
        callback(''.join(fragments))
        fragments[:] = []

    return TextHandler(fragments.append, flush)
def valueGatherer(dic, key):
    """Return a text gatherer that stores the gathered text in ``dic[key]``."""
    def store(text):
        dic[key] = text

    return textGatherer(store)
class MatchingHandler(BaseHandler):
    """Handler that descends only into elements with a given tag and attributes."""

    def __init__(self, match_tag, match_attrs, next_handler,
                 done_callback = lambda: None,
                 attributes_callback = lambda attrs: None):
        """
        match_tag           -- tag name an element must have
        match_attrs         -- ``(name, value)`` pairs that must all be
                               present among the element's attributes
        next_handler        -- handler returned for a matching element
        done_callback       -- called without arguments from ``endParsing``
        attributes_callback -- called with the attribute list of every
                               matching element
        """
        self.match_tag = match_tag
        self.match_attrs = match_attrs
        self.next_handler = next_handler
        self.done_callback = done_callback
        self.attributes_callback = attributes_callback

    def start(self, parser, tag, attrs):
        """Return ``next_handler`` for a matching element, else ``null_handler``."""
        has_all_attrs = all(wanted in attrs for wanted in self.match_attrs)
        if tag != self.match_tag or not has_all_attrs:
            return null_handler
        self.attributes_callback(attrs)
        return self.next_handler

    def endParsing(self, parser):
        """Report that this handler was retired."""
        self.done_callback()
class MultipleMatchHandler(BaseHandler):
    """Handler that tries several alternative element patterns.

    ``cases`` is a sequence of ``[tag, attrs, handler]`` entries, optionally
    extended by a fourth item: a callback that receives the attribute list
    of a matching element.  The first entry that matches an element decides
    which handler processes it.
    """

    def __init__(self, cases, done_callback = lambda: None):
        """
        cases         -- sequence of 3- or 4-item entries as described on
                         the class
        done_callback -- called without arguments from ``endParsing``
        """
        self.cases = cases
        self.done_callback = done_callback
        for case in cases:
            # the optional attributes callback makes an entry 4 items long;
            # the former check for exactly 3 items made it unusable
            assert(len(case) in (3, 4))

    def start(self, parser, tag, attrs):
        """Return the handler of the first matching case, else ``null_handler``."""
        for case in self.cases:
            match_tag = case[0]
            match_attrs = case[1]
            next_handler = case[2]
            if tag != match_tag:
                continue
            if not all(wanted in attrs for wanted in match_attrs):
                continue
            if len(case) > 3:
                # the callback is the 4th item (index 3); index 4 could
                # never exist for a 4-item entry
                case[3](attrs)
            return next_handler
        return null_handler

    def endParsing(self, parsing):
        """Report that this handler was retired."""
        self.done_callback()
class SequentialMatchHandler(BaseHandler):
    """Handler that expects child elements in a fixed order.

    ``cases`` is a sequence of ``[tag, attrs, handler]`` entries.  Only the
    entry at the current position is compared with an incoming element; a
    match advances the position.  Elements that do not match, and all
    elements after the last entry was matched, are ignored.
    """

    def __init__(self, cases, done_callback = lambda: None):
        """
        cases         -- sequence of 3-item entries as described on the class
        done_callback -- called without arguments from ``endParsing``
        """
        self.cases = cases
        self.index = 0
        self.done_callback = done_callback
        for entry in cases:
            assert(len(entry) == 3)

    def start(self, parser, tag, attrs):
        """Return the handler of the expected entry if the element matches it."""
        if self.index < len(self.cases):
            expected = self.cases[self.index]
            wanted_tag = expected[0]
            wanted_attrs = expected[1]
            child = expected[2]
            has_all_attrs = all(pair in attrs for pair in wanted_attrs)
            if tag == wanted_tag and has_all_attrs:
                self.index += 1
                return child
        return null_handler

    def endParsing(self, parser):
        """Report completion, then rewind so the handler can be reused."""
        self.done_callback()
        self.index = 0
class DataOverrider(BaseHandler):
    """Handler that sends text and tags of one element to different handlers.

    Tag events are delegated to ``tags_handler``.  Every chunk of text is
    given to ``data_handler`` wrapped in its own beginParsing / endParsing
    pair.
    """

    def __init__(self, data_handler, tags_handler, done_callback = None):
        """
        data_handler  -- receives the text chunks
        tags_handler  -- receives the start / startend / end events
        done_callback -- called with the parser from ``endParsing``; when
                         omitted, ``tags_handler.endParsing`` is used
        """
        self.data_handler = data_handler
        self.tags_handler = tags_handler
        self.done_callback = (tags_handler.endParsing
                              if done_callback is None
                              else done_callback)

    def beginParsing(self, parser):
        """Activate the tag handler."""
        self.tags_handler.beginParsing(parser)

    def start(self, parser, tag, attrs):
        """Let the tag handler choose the handler for a nested element."""
        return self.tags_handler.start(parser, tag, attrs)

    def startend(self, parser, tag, attrs):
        """Pass a self-closing tag to the tag handler."""
        self.tags_handler.startend(parser, tag, attrs)

    def data(self, parser, data):
        """Run one complete begin / data / end cycle on the data handler."""
        sink = self.data_handler
        sink.beginParsing(parser)
        sink.data(parser, data)
        sink.endParsing(parser)

    def end(self, parser, tag):
        """Pass the end of a nested element to the tag handler."""
        return self.tags_handler.end(parser, tag)

    def endParsing(self, parser):
        """Report that this handler was retired."""
        self.done_callback(parser)
def listPageRowReader(row_callback):
    """Build the handler for one table row of a thread list page.

    The row is read as four consecutive ``td`` cells: the first is skipped,
    the second yields 'title' (text of its ``a`` element) and 'link' (that
    element's ``href``), the third yields 'numposts' and the fourth yields
    'last_post_time' (text of its ``small`` element).

    row_callback -- called with a fresh dict of the gathered values each
                    time the row handler is retired.
    """
    values = {}

    def remember_link(attrs):
        href = attributeGet(attrs, 'href')
        if href is not None:
            values['link'] = href

    def emit_row():
        row_callback(dict(values))
        values.clear()

    title_cell = MatchingHandler('a', [],
                                 valueGatherer(values, 'title'),
                                 lambda: None,
                                 remember_link)
    numposts_cell = valueGatherer(values, 'numposts')
    last_post_cell = MatchingHandler('small', [],
                                     valueGatherer(values, 'last_post_time'))
    cells = [
        ['td', [], null_handler],
        ['td', [], title_cell],
        ['td', [], numposts_cell],
        ['td', [], last_post_cell],
    ]
    return SequentialMatchHandler(cells, emit_row)
def postReader(post_callback):
    """Build the handler for one post of a thread page.

    A post is read as an ``h3`` header followed by a ``blockquote``.  The
    header supplies 'postnum', 'postername', 'postermeiru' (``href`` of the
    link around the name, if there is one), 'postertrip', 'posterid' and
    'posterdate'; the ``p`` element inside the blockquote supplies 'post'.

    post_callback -- called with a fresh dict of the gathered values each
                     time the post handler is retired.
    """
    name_values = {}
    values = {}

    def remember_meiru(attrs):
        href = attributeGet(attrs, 'href')
        if href is not None:
            values['postermeiru'] = href

    def settle_name(parser):
        # a name wrapped in a link takes precedence over bare text
        fallback = name_values.get('unlinked_name', '')
        values['postername'] = name_values.get('linked_name', fallback)

    def emit_post():
        post_callback(dict(values))
        values.clear()
        name_values.clear()

    linked_name = MatchingHandler('a', [],
                                  valueGatherer(name_values, 'linked_name'),
                                  lambda: None,
                                  remember_meiru)
    name_span = DataOverrider(valueGatherer(name_values, 'unlinked_name'),
                              linked_name,
                              settle_name)
    info_spans = SequentialMatchHandler([
        ['span', [('class', 'postername')], name_span],
        ['span', [('class', 'postertrip')], valueGatherer(values, 'postertrip')],
        ['span', [('class', 'id')], valueGatherer(values, 'posterid')],
        ['span', [('class', 'posterdate')], valueGatherer(values, 'posterdate')],
    ])
    number_span = MatchingHandler('a', [], valueGatherer(values, 'postnum'))
    header = SequentialMatchHandler([
        ['span', [('class', 'postnum')], number_span],
        ['span', [('class', 'postinfo')], info_spans],
    ])
    body = MatchingHandler('p', [], valueGatherer(values, 'post'))
    return SequentialMatchHandler([
        ['h3', [], header],
        ['blockquote', [], body],
    ], emit_post)
def listPageReader(row_callback, done_callback = lambda: None):
    """Build the root handler for a thread list page.

    The handler descends html > body > div.hborder > div.head >
    table.threads > tbody > tr and reads every row with a
    ``listPageRowReader``.

    row_callback  -- called with the dict gathered for each row
    done_callback -- called without arguments when the root handler is
                     retired
    """
    # path below <html>, listed from the outside in
    path = [
        ('body', []),
        ('div', [('class', 'hborder')]),
        ('div', [('class', 'head')]),
        ('table', [('class', 'threads')]),
        ('tbody', []),
        ('tr', []),
    ]
    handler = listPageRowReader(row_callback)
    # wrap from the innermost element outwards
    for tag, attrs in reversed(path):
        handler = MatchingHandler(tag, attrs, handler)
    return MatchingHandler('html', [], handler, done_callback)
def threadPageReader(title_callback, post_callback, done_callback = lambda: None):
    """Build the root handler for a thread page.

    Inside html > body the ``h2`` element supplies the title, and every
    ``div`` of class 'post even' or 'post odd' within ``div.thread`` is read
    as a post; both kinds share one post reader.

    title_callback -- called with the text of the ``h2`` element
    post_callback  -- called with the dict gathered for each post
    done_callback  -- called without arguments when the root handler is
                      retired
    """
    post_reader = postReader(post_callback)
    title_reader = textGatherer(title_callback)
    posts = MultipleMatchHandler([
        ['div', [('class', 'post even')], post_reader],
        ['div', [('class', 'post odd')], post_reader],
    ])
    body_content = MultipleMatchHandler([
        ['h2', [], title_reader],
        ['div', [('class', 'thread')], posts],
    ])
    body = MatchingHandler('body', [], body_content)
    return MatchingHandler('html', [], body, done_callback)
- #######
- #### ProgScraper
- ####
- #### Creates urls and downloads pages using a http(s) connection.
- ####
class ScraperException(Exception):
    """Base class of the errors raised by the scrapers.

    Derives from ``Exception`` so that ``except Exception`` handlers (as
    used by the synchronisation code) actually catch scraper failures.
    """
    pass


class BadResponse(ScraperException):
    """Raised when the server answers with a status other than 200.

    The response object is available as ``self.response`` and as the sole
    entry of ``self.args``.
    """

    def __init__(self, response):
        ScraperException.__init__(self, response)
        self.response = response
class ProgScraper:
    """Builds request paths and downloads pages over fresh HTTP(S) connections."""

    def __init__(self, conn_gen, min_req_delay = 0, max_req_delay = 2):
        """
        conn_gen      -- callable without arguments returning a new
                         connection object (``request``, ``getresponse``
                         and ``close`` are used)
        min_req_delay -- lower bound, in whole seconds, of the random pause
                         before each request
        max_req_delay -- upper bound (inclusive) of that pause
        """
        self.conn_gen = conn_gen
        self.min_req_delay = min_req_delay
        self.max_req_delay = max_req_delay
        self.rand = ra.Random()

    def read_get(self, url):
        """GET ``url`` and return the decoded response body.

        Raises BadResponse when the status is not 200.  The connection is
        closed in every case, also when the request itself fails.
        """
        # random pause between min_req_delay and max_req_delay seconds
        tm.sleep(self.rand.randint(self.min_req_delay, self.max_req_delay))
        # single-argument print(...) gives the same output on Python 2 and 3
        print("%s %s" % ("starting download: ", url))
        sys.stdout.flush()
        conn = self.conn_gen()
        try:
            conn.request('GET', url)
            response = conn.getresponse()
            if response.status != 200:
                print("%s %s %s %s" % ("bad response getting ", url, ":", response))
                sys.stdout.flush()
                raise BadResponse(response)
            print("%s %s" % ("finished download: ", url))
            sys.stdout.flush()
            return getdata(response)
        finally:
            # formerly skipped when request/getresponse/read raised
            conn.close()

    def get_subject(self, board):
        """Download ``/<board>/subject.txt``."""
        return self.read_get('/' + board + '/subject.txt')

    def get_list_page(self, board, n):
        """Download page ``n`` of the thread list of ``board``."""
        return self.read_get('/list/' + board + '/' + str(n))

    def get_read_page(self, board, thread, range_expr = ''):
        """Download a thread; ``range_expr`` is appended to the path as given."""
        return self.read_get('/read/' + board + '/' + str(thread) + '/' + range_expr)
- #######
- #### DummyScraper
- ####
- #### A dummy implementation of ProgScraper for testing that doesn't use any socket.
- ####
class DummyScraper:
    """Stand-in for ProgScraper that serves fixed local files.

    Offers the same ``get_*`` methods; the arguments are only logged, the
    returned content always comes from 'subject.txt', 'list.html' or
    'thread.html' in the current directory.
    """

    def get_file(self, filename):
        """Return the decoded content of ``filename``.

        The file is closed even when reading or decoding fails.
        """
        with open(filename, 'r') as handle:
            return getdata(handle)

    def get_subject(self, board):
        """Return the content of 'subject.txt'; ``board`` is only logged."""
        # single-argument print(...) gives the same output on Python 2 and 3
        print("%s %s" % ("DummyScrapper getting subject for", board))
        sys.stdout.flush()
        return self.get_file('subject.txt')

    def get_list_page(self, board, n):
        """Return the content of 'list.html'; the arguments are only logged."""
        print("%s %s %s" % ("DummyScrapper getting list for", board, n))
        sys.stdout.flush()
        return self.get_file('list.html')

    def get_read_page(self, board, thread, range_expr = ''):
        """Return the content of 'thread.html'; the arguments are only logged."""
        print("%s %s %s %s" % ("DummyScrapper getting read page for", board, thread, range_expr))
        sys.stdout.flush()
        return self.get_file('thread.html')
- #######
- #### ProgParser
- ####
- #### Routines for extracting data from fetched pages.
- ####
class ProgParser:
    """Routines that extract data from fetched pages."""

    def parse_subject(self, subject):
        """Split a subject listing into one list of fields per line.

        Lines are separated by a newline and fields by ``<>``; see
        ``subject_indeces`` for the position of each field.
        """
        return [line.split('<>') for line in subject.split('\n')]

    # position of each field within a subject line; 'numindeces' is the
    # number of fields a well-formed line has
    subject_indeces = {
        'title' : 0,
        'opname' : 1,
        'noicon' : 2, # wtf is this noicon.png?
        'threadid' : 3,
        'numposts' : 4,
        'lastpostername' : 5,
        'lastpostertime' : 6,
        'numindeces' : 7,
    }

    def parse_list_page(self, page):
        """Return one dict per thread row found in a list page."""
        rows = []
        reader = StackMachineHTMLParser(listPageReader(rows.append))
        reader.feed(page)
        reader.close()
        return rows

    def parse_read_page(self, page):
        """Return ``{'title': str, 'posts': [dict, ...]}`` for a thread page.

        'title' stays '' when the page has no recognisable title.
        """
        values = { 'title' : '', 'posts' : []}
        def title_callback(value):
            values['title'] = value
        reader = StackMachineHTMLParser(threadPageReader(title_callback, values['posts'].append))
        reader.feed(page)
        reader.close()
        return values

    def parse_timestamp(self, string):
        """Convert ``'YYYY-MM-DD HH:MM'`` to integer seconds since the epoch.

        The value is interpreted as local time.  Returns None when the
        text does not have that shape or names an impossible date.
        """
        try:
            date_str, time_str = string.split(' ')
            year, month, day = [int(part) for part in date_str.split('-')]
            hour, minute = [int(part) for part in time_str.split(':')]
            moment = dt.datetime(year, month, day, hour, minute)
            # time.mktime is the portable spelling of strftime('%s'), which
            # is a platform extension and fails where it is not supported
            return int(tm.mktime(moment.timetuple()))
        except (AttributeError, TypeError, ValueError, OverflowError):
            # wrong type, wrong number of parts, non-numeric parts or a
            # date outside the representable range
            return None
- #######
- #### BEGIN PROGDB
- ####
- #### Reads and writes data to a sqlite database.
- ####
class ProgDB:
    """Reads and writes threads, posts and sync state in a sqlite database.

    Nothing is committed implicitly; call ``commit`` to persist changes.
    """

    # key of the only row of the syncdata table
    dummykey = 0

    def __init__(self, conn):
        """conn -- an open sqlite3 connection."""
        self.conn = conn

    def commit(self):
        """Commit the pending changes on the connection."""
        self.conn.commit()

    def create(self):
        """Create tables, the syncdata row and the indexes unless they exist."""
        cur = self.conn.cursor()
        cur.execute('CREATE TABLE '
                    'IF NOT EXISTS '
                    'threads (threadid integer,'
                    'title text,'
                    'board text,'
                    'firstposttime integer,'
                    'lastposttime integer,'
                    'lastbumptime integer,'
                    'numposts integer,'
                    'PRIMARY KEY (threadid))')
        cur.execute('CREATE TABLE '
                    'IF NOT EXISTS '
                    'posts (threadid integer,'
                    'postnum integer,'
                    'postername text,'
                    'postermeiru text,'
                    'postertrip text,'
                    'posterid text,'
                    'posterdate integer,'
                    'post text,'
                    'PRIMARY KEY (threadid, postnum),'
                    'FOREIGN KEY (threadid) REFERENCES threads(threadid))')
        # singleton table
        cur.execute('CREATE TABLE '
                    'IF NOT EXISTS '
                    'syncdata (dummykey integer,'
                    'lastsynctime integer,'
                    'PRIMARY KEY (dummykey))')
        cur.execute('INSERT OR IGNORE INTO syncdata VALUES (?, NULL)', (ProgDB.dummykey,))
        cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS threads_id on threads(threadid)')
        cur.execute('CREATE INDEX IF NOT EXISTS threads_lastbumptime on threads(board, lastbumptime)')
        cur.execute('CREATE INDEX IF NOT EXISTS threads_lastposttime on threads(board, lastposttime)')
        cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS posts_id on posts(threadid, postnum)')

    def write_thread(self, threadid, title, board, firstposttime, lastposttime, lastbumptime, numposts):
        """Insert a thread row; an existing row with that id is left untouched."""
        cur = self.conn.cursor()
        cur.execute('INSERT OR IGNORE INTO threads values (?,?,?,?,?,?,?)',
                    (threadid, title, board, firstposttime, lastposttime, lastbumptime, numposts))

    def write_empty_thread(self, threadid, title, board):
        """Insert a thread row with all times and the post count set to 0."""
        self.write_thread(threadid, title, board, 0, 0, 0, 0)

    def update_thread(self, threadid, firstposttime, lastposttime, lastbumptime, numposts):
        """Overwrite the times of a thread; the post count never decreases."""
        cur = self.conn.cursor()
        # No LIMIT clause: threadid is the primary key, so at most one row
        # matches, and UPDATE ... LIMIT is only accepted by SQLite builds
        # compiled with SQLITE_ENABLE_UPDATE_DELETE_LIMIT.
        cur.execute('UPDATE threads SET firstposttime = ?, '
                    'lastposttime = ?, '
                    'lastbumptime = ?, '
                    'numposts = max(numposts, ?) '
                    'WHERE threadid = ?',
                    (firstposttime, lastposttime, lastbumptime, numposts, threadid))

    def insert_post(self, threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post):
        """Insert a post row; an existing (threadid, postnum) row is kept."""
        cur = self.conn.cursor()
        cur.execute('INSERT OR IGNORE INTO posts values (?,?,?,?,?,?,?,?)',
                    (threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post) )

    def write_post(self, threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post):
        """Insert a post and bring the summary columns of its thread up to date.

        lastposttime and numposts only ever grow.  lastbumptime is raised
        unless the post's meiru is 'mailto:sage'.  Post number 1 sets
        firstposttime.
        """
        self.insert_post(threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post)
        cur = self.conn.cursor()
        cur.execute('UPDATE threads SET lastposttime = max(lastposttime, ?), '
                    'numposts = max(numposts, ?) '
                    'WHERE threadid = ?',
                    (posterdate, postnum, threadid))
        if postermeiru != 'mailto:sage':
            cur.execute('UPDATE threads SET lastbumptime = max(lastbumptime, ?) '
                        'WHERE threadid = ?',
                        (posterdate, threadid))
        if postnum == 1:
            cur.execute('UPDATE threads SET firstposttime = ? '
                        'WHERE threadid = ?',
                        (posterdate, threadid))

    def _select_page(self, sql, key, limit, offset):
        """Run ``sql`` (one placeholder, bound to ``key``) with optional paging.

        Returns the executed cursor.  An offset without a limit is bound
        as ``LIMIT -1``, which SQLite treats as "no upper bound"; binding
        NULL there is rejected.
        """
        params = [key]
        if offset is not None:
            sql += ' limit ? offset ?'
            params.append(-1 if limit is None else limit)
            params.append(offset)
        elif limit is not None:
            sql += ' limit ?'
            params.append(limit)
        cur = self.conn.cursor()
        cur.execute(sql, tuple(params))
        return cur

    def get_threads_by_board(self, board, limit = None, offset = None):
        """Return a cursor over the threads of ``board``, latest bump first."""
        return self._select_page(
            'select * from threads where threads.board = ? ORDER BY threads.lastbumptime DESC',
            board, limit, offset)

    def get_posts_by_thread(self, threadid, limit = None, offset = None):
        """Return a cursor over the posts of a thread ordered by post number."""
        return self._select_page(
            'select * from posts where posts.threadid = ? order by posts.postnum',
            threadid, limit, offset)

    def get_thread(self, threadid):
        """Return the row of a thread, or None when it is not stored."""
        cur = self.conn.cursor()
        cur.execute('select * from threads where threads.threadid = ?', (threadid,))
        return cur.fetchone()

    def get_post(self, threadid, postnum):
        """Return the row of a post, or None when it is not stored."""
        cur = self.conn.cursor()
        cur.execute('select * from posts where posts.threadid = ? and posts.postnum = ?', (threadid, postnum))
        return cur.fetchone()

    class ThreadEditor:
        """Accumulates summary changes of one thread and writes them in one go."""

        def __init__(self, db, threadid, firstposttime, lastposttime, lastbumptime, numposts):
            """Start from the given current summary values of ``threadid``."""
            self.db = db
            self.threadid = threadid
            self.firstposttime = firstposttime
            self.lastposttime = lastposttime
            self.lastbumptime = lastbumptime
            self.numposts = numposts

        def add_post(self, postnum, postermeiru, posterdate):
            """Fold one post into the summary (same rules as ``write_post``)."""
            if postnum == 1:
                self.firstposttime = posterdate
            self.lastposttime = max(self.lastposttime, posterdate)
            if postermeiru != 'mailto:sage':
                self.lastbumptime = max(self.lastbumptime, posterdate)
            self.numposts = max(self.numposts, postnum)

        def write_back(self):
            """Store the accumulated summary through ``update_thread``."""
            self.db.update_thread(self.threadid, self.firstposttime, self.lastposttime, self.lastbumptime, self.numposts)

    def get_thread_editor(self, threadid):
        """Return a ThreadEditor primed with the stored summary of a thread.

        The thread must exist; unpacking the missing row fails otherwise.
        """
        (threadid_, title, board, firstposttime, lastposttime, lastbumptime, numposts) = self.get_thread(threadid)
        assert(threadid_ == threadid)
        return ProgDB.ThreadEditor(self, threadid, firstposttime, lastposttime, lastbumptime, numposts)

    def get_syncdata(self):
        """Return the ``(dummykey, lastsynctime)`` row of the syncdata table."""
        cur = self.conn.cursor()
        cur.execute('SELECT * FROM syncdata WHERE dummykey = ? LIMIT 1', (ProgDB.dummykey,))
        return cur.fetchone()

    def set_lastsynctime(self, lastsynctime):
        """Store the time of the last synchronisation."""
        cur = self.conn.cursor()
        cur.execute('UPDATE syncdata SET lastsynctime = ? '
                    'WHERE dummykey = ?',
                    (lastsynctime, ProgDB.dummykey))

    def get_lastsynctime(self):
        """Return the stored time of the last synchronisation (None if never set)."""
        (dummykey, lastsynctime) = self.get_syncdata()
        return lastsynctime

    def current_progtime(self):
        """Return "now minus four hours" as integer seconds since the epoch.

        The UTC wall clock is shifted back by four hours and the result is
        converted as if it were local time.
        """
        shifted = dt.datetime.utcnow() - dt.timedelta(hours=4)
        # time.mktime is the portable spelling of strftime('%s')
        return int(tm.mktime(shifted.timetuple()))
# column names of the rows of the threads and posts tables, in column order
thread_keys = ('threadid', 'title', 'board', 'firstposttime', 'lastposttime', 'lastbumptime', 'numposts')
post_keys = ('threadid', 'postnum', 'postername', 'postermeiru', 'postertrip', 'posterid', 'posterdate', 'post')


def reverse_array_map(arr, inverse_map):
    """Record in ``inverse_map`` the position of every element of ``arr``.

    For an element that occurs more than once the last position wins.
    """
    for position, element in enumerate(arr):
        inverse_map[element] = position


# column name -> position within a row
thread_key_indeces = {}
reverse_array_map(thread_keys, thread_key_indeces)
post_key_indeces = {}
reverse_array_map(post_keys, post_key_indeces)
def print_row(keys, values, indent = ''):
    """Print a row as a list of ``(key, value)`` pairs after ``indent``.

    ``keys`` and ``values`` must have the same length.  The indent and the
    list are separated by one space.
    """
    assert(len(keys) == len(values))
    pairs = list(zip(keys, values))
    # single-argument print(...) gives the same output on Python 2 and 3
    print("%s %s" % (indent, pairs))
    sys.stdout.flush()
def printDB(progdb):
    """Print every thread of board 'prog', each followed by its posts.

    Thread rows are printed without indent, post rows with a one-space
    indent.
    """
    for thread_row in progdb.get_threads_by_board('prog'):
        print_row(thread_keys, thread_row)
        # the thread id is the first column of a thread row
        for post_row in progdb.get_posts_by_thread(thread_row[0]):
            print_row(post_keys, post_row, ' ')
- #######
- #### BEGIN ProgSynch
- ####
- #### Takes data from ProgScraper and updates a ProgDB.
- ####
class ProgSynch:
    """Takes pages from a scraper, parses them and updates a ProgDB.

    Long-running operations poll ``abortOp`` between work items, so another
    thread can stop them through ``abort``.
    """

    def __init__(self, scraper, parser, db):
        """
        scraper -- provides get_subject / get_list_page / get_read_page
        parser  -- provides parse_subject / parse_list_page /
                   parse_read_page / parse_timestamp
        db      -- the ProgDB that is updated
        """
        self.scraper = scraper
        self.parser = parser
        self.db = db
        self.abortOp = False

    def abort(self):
        """Ask the running operation to stop at the next opportunity."""
        self.abortOp = True

    def reset(self):
        """Clear a previous abort request."""
        self.abortOp = False

    def commit(self):
        """Commit the database."""
        self.db.commit()

    def _read_file(self, filename):
        """Return the decoded content of ``filename``; the file is always closed."""
        with open(filename, 'r') as handle:
            return getdata(handle)

    def write_thread_page_data(self, board, threadid, page, expected_postnum_start = 1):
        """Parse a thread page and store the thread together with its posts.

        expected_postnum_start -- number given to the first post if its
                                  own number cannot be read

        Posts with an unreadable number receive the number following the
        previous post.  Malformed posts are reported on stdout and stored
        with '' for the missing fields.
        """
        parsed_page = self.parser.parse_read_page(page)
        if 'title' not in parsed_page:
            # single-argument print(...) gives the same output on Python 2 and 3
            print("%s %s" % ("warning: malformed thread, could not read title:", parsed_page))
            sys.stdout.flush()
        self.db.write_empty_thread(threadid, parsed_page.get('title', ''), board)
        editor = self.db.get_thread_editor(threadid)
        required = ('postername', 'postertrip', 'posterid', 'posterdate', 'post')
        backup_postnum = expected_postnum_start
        for post in parsed_page['posts']:
            # keep a post num count for recovery
            try:
                postnum = int(post['postnum'])
                backup_postnum = postnum
                postnum_valid = True
            except (KeyError, TypeError, ValueError):
                postnum = backup_postnum
                postnum_valid = False
            # number the next post would get if it turns out unreadable
            backup_postnum += 1
            postername = post.get('postername', '')
            postermeiru = post.get('postermeiru', '')
            postertrip = post.get('postertrip', '')
            posterid = post.get('posterid', '')
            # NOTE(review): the date is passed on exactly as parsed, without
            # conversion -- confirm that this is what the db should store
            posterdate = post.get('posterdate', '')
            text = post.get('post', '')
            if not postnum_valid or any(key not in post for key in required):
                print("%s %s" % ("warning: malformed post in read page.", post))
                sys.stdout.flush()
            editor.add_post(postnum, postermeiru, posterdate)
            self.db.insert_post(threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, text)
        editor.write_back()

    def write_thread_page_file(self, board, threadid, filename):
        """Store the thread page saved in ``filename``."""
        self.write_thread_page_data(board, threadid, self._read_file(filename))

    def write_thread_page(self, board, threadid, range_expr = ''):
        """Download a thread (or the part named by ``range_expr``) and store it."""
        page = self.scraper.get_read_page(board, threadid, range_expr)
        self.write_thread_page_data(board, threadid, page)

    def synch_subject_page_data(self, board, page):
        """Synchronise every thread listed in a subject page.

        Unknown threads are downloaded completely.  Known threads are
        asked for their new posts when the listing reports more posts than
        stored and a last post not older than the last sync.  Entries that
        cannot be processed are reported on stdout and skipped.

        Returns True when at least one thread was downloaded.
        """
        parsed_page = self.parser.parse_subject(page)
        modified = False
        current_synctime = self.db.current_progtime()
        lastsynctime = self.db.get_lastsynctime()
        completed = True
        for entry in parsed_page:
            if self.abortOp:
                completed = False
                break
            try:
                indeces = ProgParser.subject_indeces
                if len(entry) != indeces['numindeces']:
                    raise ValueError("expected %d fields, found %d"
                                     % (indeces['numindeces'], len(entry)))
                threadid = int(entry[indeces['threadid']])
                numposts = int(entry[indeces['numposts']])
                lastposttime = int(entry[indeces['lastpostertime']])
                local_thread = self.db.get_thread(threadid)
                if local_thread is None:
                    # thread does not exist in local storage. Synch the entire thread.
                    self.write_thread_page(board, threadid)
                    modified = True
                else:
                    local_numposts = int(local_thread[thread_key_indeces['numposts']])
                    # a sync time of None means that no sync was recorded yet
                    never_synced = lastsynctime is None
                    if (never_synced or lastsynctime <= lastposttime) and local_numposts < numposts:
                        # new posts have appeared in the thread. Synch the new posts.
                        self.write_thread_page(board, threadid, str(local_numposts + 1) + '-')
                        modified = True
            except Exception as e:
                # deliberate best effort: one bad entry or failed download
                # must not stop the remaining entries
                print("%s %s" % ("malformed entry in subject page. Could not synch:", entry))
                print("%s %s" % (getattr(e, 'message', e), e.args))
                sys.stdout.flush()
        if completed:
            # An aborted run has not looked at the remaining entries yet;
            # advancing the sync time would hide their new posts from the
            # next run.
            self.db.set_lastsynctime(current_synctime)
        return modified

    def synch_subject_page_file(self, board, filename):
        """Synchronise from a subject page saved in ``filename``."""
        return self.synch_subject_page_data(board, self._read_file(filename))

    def synch_subject_page(self, board):
        """Download the subject page of ``board`` and synchronise from it."""
        page = self.scraper.get_subject(board)
        return self.synch_subject_page_data(board, page)

    def synch_list_page_data(self, board, page):
        """Synchronise every thread listed in a list page.

        Unknown threads are downloaded completely.  Known threads are
        asked for their new posts when the page reports more posts than
        stored and the last post time is unreadable or later than the last
        sync.  Entries that cannot be processed are reported on stdout and
        skipped.

        Returns True when at least one thread was downloaded.
        """
        parsed_page = self.parser.parse_list_page(page)
        lastsynctime = self.db.get_lastsynctime()
        modified = False
        for entry in parsed_page:
            if self.abortOp:
                break
            try:
                link = entry['link']
                numposts = int(entry['numposts'])
                last_post_time = entry['last_post_time']
                last_post_timestamp = self.parser.parse_timestamp(last_post_time)
                prefix = 'read/' + board + '/'
                if not link.startswith(prefix):
                    raise ValueError("unexpected thread link %r" % (link,))
                threadid = int(link[len(prefix):])
                local_thread = self.db.get_thread(threadid)
                if local_thread is None:
                    # thread does not exist in local db. Synch entire thread.
                    self.write_thread_page(board, threadid)
                    modified = True
                else:
                    local_numposts = local_thread[thread_key_indeces['numposts']]
                    # without a usable time on either side the post count decides
                    maybe_newer = (last_post_timestamp is None
                                   or lastsynctime is None
                                   or lastsynctime < last_post_timestamp)
                    if maybe_newer and local_numposts < numposts:
                        # synch the new posts
                        self.write_thread_page(board, threadid, str(local_numposts + 1) + '-')
                        # counted as a modification, as in the subject sync
                        modified = True
            except Exception as e:
                # deliberate best effort: one bad entry or failed download
                # must not stop the remaining entries
                print("%s %s" % ("error: malformed entry in list page. Could not synch:", entry))
                print("%s %s" % (getattr(e, 'message', e), e.args))
                sys.stdout.flush()
        return modified

    def synch_list_page_file(self, board, filename):
        """Synchronise from a list page saved in ``filename``."""
        return self.synch_list_page_data(board, self._read_file(filename))

    def synch_list_page(self, board, list_page_num):
        """Download list page ``list_page_num`` and synchronise from it."""
        page = self.scraper.get_list_page(board, list_page_num)
        return self.synch_list_page_data(board, page)

    def synch_top_list_pages(self, board, num_pages):
        """Synchronise list pages 1 .. ``num_pages`` unless aborted."""
        for page_num in range(1, num_pages + 1):
            if self.abortOp:
                break
            self.synch_list_page(board, page_num)

    def synch_list_pages_while_modifying(self, board):
        """Synchronise list pages from page 1 on until one brings nothing new."""
        modified = True # kick start
        index = 1
        while modified:
            if self.abortOp:
                break
            modified = self.synch_list_page(board, index)
            index += 1

    def import_threads_dir(self, board, dirname, filename_to_threadid = lambda filename: int(filename)):
        """Import every file of ``dirname`` as a saved thread page.

        filename_to_threadid -- maps a file name to the thread id; errors
                                it raises are not caught here
        """
        for filename in os.listdir(dirname):
            if self.abortOp:
                break
            print("%s %s" % ("importing thread", filename))
            sys.stdout.flush()
            threadid = filename_to_threadid(filename)
            self.write_thread_page_file(board, threadid, dirname + '/' + filename)

    def import_nested_threads_dir(self, board, dirname, filename_to_threadid = lambda filename: int(filename)):
        """Import the saved thread pages found anywhere below ``dirname``.

        filename_to_threadid -- maps a file name to the thread id; files
                                for which it raises are skipped silently
        """
        for (path, subdirs, filenames) in os.walk(dirname):
            if self.abortOp:
                break
            print("%s %s" % ("importing all threads under ", path))
            sys.stdout.flush()
            for filename in filenames:
                if self.abortOp:
                    break
                try:
                    threadid = filename_to_threadid(filename)
                except Exception:
                    # not a thread file according to the converter
                    continue
                filepath = path + '/' + filename
                print("%s %s" % ("importing thread", filepath))
                sys.stdout.flush()
                self.write_thread_page_file(board, threadid, filepath)
- #######
- #### BEGIN ProgAsynch
- ####
- #### Issues async commands to a ProgSynch and can abort commands.
- ####
class ProgAsynch:
    """Runs one command at a time against a sync object on a worker thread."""

    def __init__(self, sync):
        """sync -- object handed to every command; provides reset() and abort()."""
        self.sync = sync
        self.running_thread = None

    def start_command(self, command):
        """Run ``command(sync)`` on a new thread.

        Returns True when the command was started and False while an
        earlier command is still running.  A finished command no longer
        blocks new ones, whether or not ``wait`` was called for it.
        """
        if not self.is_done():
            # another command is running...
            return False
        self.sync.reset()
        self.running_thread = thrd.Thread(target=command, args=(self.sync,))
        self.running_thread.start()
        return True

    def abort(self):
        """Ask the sync object to abort, provided a command was started."""
        if self.running_thread is not None:
            self.sync.abort()

    def is_done(self):
        """Return True when no command is currently executing."""
        return self.running_thread is None or not self.running_thread.is_alive()

    def wait(self):
        """Block until the current command has finished, then forget its thread."""
        if self.running_thread is not None:
            self.running_thread.join()
            self.running_thread = None
def parser_test1():
    """Parse the dummy thread page and print the parsed result."""
    page = DummyScraper().get_read_page('prog', 1337)
    parsed_page = ProgParser().parse_read_page(page)
    # single-argument print(...) gives the same output on Python 2 and 3
    print("parsed page")
    print(parsed_page)
def synch_test1():
    """Store the dummy thread page as thread 1337 of 'prog' in prog.db."""
    scraper = DummyScraper()
    parser = ProgParser()
    dbname = 'prog.db'
    dbconn = sq.connect(dbname)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        synch.write_thread_page('prog', 1337)
        synch.commit()
        #printDB(db)
    finally:
        # release the database even when the import fails
        dbconn.close()
def synch_test3():
    """Synchronise prog.db from the dummy subject page of 'prog'."""
    scraper = DummyScraper()
    parser = ProgParser()
    dbname = 'prog.db'
    dbconn = sq.connect(dbname)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        synch.synch_subject_page('prog')
        synch.commit()
        #printDB(db)
    finally:
        # release the database even when the sync fails
        dbconn.close()
def synch_test4():
    """Synchronise prog.db from the first five dummy list pages of 'prog'."""
    scraper = DummyScraper()
    parser = ProgParser()
    dbname = 'prog.db'
    dbconn = sq.connect(dbname)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        synch.synch_top_list_pages('prog', 5)
        synch.commit()
        #printDB(db)
    finally:
        # release the database even when the sync fails
        dbconn.close()
def synch_test5():
    """Import the '<threadid>.html' files of directory 'threads2' into prog.db."""
    scraper = DummyScraper()
    parser = ProgParser()
    dbname = 'prog.db'
    dbconn = sq.connect(dbname)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        def filename_to_threadid(filename):
            # the thread id is the file name without its '.html' suffix
            assert(filename.endswith('.html'))
            return int(filename[0:len(filename) - len('.html')])
        synch.import_threads_dir('prog', 'threads2', filename_to_threadid)
        synch.commit()
        #printDB(db)
    finally:
        # release the database even when the import fails
        dbconn.close()
def synch_test6():
    """Import the '<threadid>.html' files found below 'threads' into prog.db."""
    scraper = DummyScraper()
    parser = ProgParser()
    dbname = 'prog.db'
    dbconn = sq.connect(dbname)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        def filename_to_threadid(filename):
            # the thread id is the file name without its '.html' suffix
            assert(filename.endswith('.html'))
            return int(filename[0:len(filename) - len('.html')])
        synch.import_nested_threads_dir('prog', 'threads', filename_to_threadid)
        synch.commit()
        #printDB(db)
    finally:
        # release the database even when the import fails
        dbconn.close()
def demo1():
    """Synchronise prog.db from the first five list pages of 'prog' on dis.4chan.org."""
    conn_gen = lambda: http.HTTPSConnection('dis.4chan.org', timeout=20)
    scraper = ProgScraper(conn_gen)
    parser = ProgParser()
    dbname = 'prog.db'
    dbconn = sq.connect(dbname)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        synch.synch_top_list_pages('prog', 5)
        synch.commit()
        #printDB(db)
    finally:
        # release the database even when a download fails
        dbconn.close()
- # sync all of /prog/
def demo2():
    """Synchronise prog.db from the subject page of 'prog' on dis.4chan.org."""
    conn_gen = lambda: http.HTTPSConnection('dis.4chan.org', timeout=20)
    scraper = ProgScraper(conn_gen)
    parser = ProgParser()
    dbname = 'prog.db'
    dbconn = sq.connect(dbname)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        synch.synch_subject_page('prog')
        synch.commit()
        #printDB(db)
    finally:
        # release the database even when a download fails
        dbconn.close()
- # sync from dummy subject but then stop after 30 seconds.
def async_test1():
    """Run a subject sync on a worker thread and abort it after 30 seconds."""
    scraper = DummyScraper()
    parser = ProgParser()
    dbname = 'prog.db'
    # the connection is used from the worker thread, hence check_same_thread=False
    dbconn = sq.connect(dbname, check_same_thread=False)
    try:
        db = ProgDB(dbconn)
        db.create()
        synch = ProgSynch(scraper, parser, db)
        asynch = ProgAsynch(synch)
        def command(synch):
            synch.synch_subject_page('prog')
            synch.commit()
        asynch.start_command(command)
        try:
            tm.sleep(30)
        finally:
            # stop the worker before the connection goes away, also when
            # the sleep is interrupted
            asynch.abort()
            asynch.wait()
        #printDB(db)
    finally:
        dbconn.close()
Advertisement
Add Comment
Please, Sign In to add comment