sync script

#!/usr/bin/python
# coding=utf8

# Copyright (c) 2013, 4d9c64076ff3943f1d6645371b0db6f9b0576a6b7f7c1dfc2c0aa3da65afd74a
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * The names of its contributors may not be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#     * The source may only be modified by a programmer that has reflected on
#       the existence of magical fairies for a period no less than 10 seconds.
#       After performing this exercise, the programmer may edit the source for
#       up to 4 hours, after which the fairy affirmation exercise must be
#       repeated to renew their permission to edit.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL 4d9c64076ff3943f1d6645371b0db6f9b0576a6b7f7c1dfc2c0aa3da65afd74a
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION; OR INJURY INDUCED BY
# FAIRY HALLUCINATIONS) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

import HTMLParser as html
import httplib as http
import sqlite3 as sq
import time as tm
import datetime as dt
import random as ra
import sys
import os
import threading as thrd
import thread as thd


class MonkeyPatchHTMLParser(html.HTMLParser):
    """HTMLParser subclass whose tokenizer loop tolerates malformed '&#' sequences.

    Only goahead() is overridden.  NOTE(review): the body looks like a copy of
    the standard library's HTMLParser.goahead with one branch replaced (marked
    BEGIN/END MONKEY PATCH) -- confirm against the stdlib version in use.
    """

    # changed recovery for bad &# sequences.
    def goahead(self, end):
        """Consume as much of self.rawdata as possible, dispatching handle_* callbacks.

        `end` is true when no more input will follow; in that case incomplete
        constructs are flushed as plain data instead of being kept buffered.
        The unconsumed tail is stored back into self.rawdata.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            # Find the next character that may start markup or a reference.
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            # Everything before the interesting character is plain text.
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # Dispatch on the kind of markup; each parse_* returns the end
                # offset, or a negative value when the construct is incomplete.
                if html.starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # A lone '<' that starts nothing recognisable is text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # Incomplete construct: wait for more input unless this is
                    # the final call, in which case emit it as raw data.
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = html.charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the terminating character.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # If the terminator was not ';', leave it in the input.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    # Not a valid character reference: pass the two characters
                    # '&#' through as data and keep scanning after them.
                    # BEGIN MONKEY PATCH
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                    continue
                    # END MONKEY PATCH
            elif startswith('&', i):
                match = html.entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # If the terminator was not ';', leave it in the input.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = html.incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        # On the final call, flush whatever is left as data (unless inside a
        # CDATA-content element).
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep the unconsumed tail for the next feed().
        self.rawdata = rawdata[i:]

#######
#### BEGIN HTML PARSER
####

class StackMachineHTMLParser(MonkeyPatchHTMLParser):
   """HTML parser that forwards events to a stack of handler objects.

   Every start tag asks the current handler for a child handler and pushes it
   together with the tag name; a matching end tag pops back down to (and
   including) that entry.  When the stack is empty the root handler receives
   the events.
   """

   def __init__(self, handler):
      """Set up the parser with `handler` as root and notify it that parsing begins."""
      html.HTMLParser.__init__(self)
      self.root_handler = handler
      self.handlers = []
      self.root_handler.beginParsing(self)

   def push_handler(self, outer_tag, handler):
      """Notify `handler` that it starts, then make it current for `outer_tag`."""
      handler.beginParsing(self)
      self.handlers.append((outer_tag, handler))

   def pop_handler(self):
      """Remove the current handler and notify it that it is finished."""
      _tag, finished = self.handlers.pop()
      finished.endParsing(self)

   def top_handler(self):
      """Return the current handler, or the root handler if the stack is empty."""
      if self.handlers:
         return self.handlers[-1][1]
      return self.root_handler

   def handle_starttag(self, tag, attrs):
      #print "( start tag", tag, attrs
      child = self.top_handler().start(self, tag, attrs)
      self.push_handler(tag, child)

   def handle_startendtag(self, tag, attrs):
      #print "() startend tag", tag, attrs
      self.top_handler().startend(self, tag, attrs)

   def handle_data(self, data):
      #print "= data", data
      self.top_handler().data(self, data)

   def handle_entityref(self, name):
      #print "= entity", name
      self.top_handler().entityref(self, name)

   def handle_charref(self, name):
      #print "= char", name
      self.top_handler().charref(self, name)

   def handle_endtag(self, tag):
      """Unwind the stack to the innermost entry opened by `tag`.

      End tags with no matching open entry are ignored.
      """
      #print ") end tag", tag, self.handlers[-1][0]
      # Search from the top of the stack for the nearest entry for this tag.
      depth = len(self.handlers) - 1
      while depth >= 0 and self.handlers[depth][0] != tag:
         depth -= 1
      if depth < 0:
         return
      # Pop the matching entry and everything opened after it.
      while len(self.handlers) > depth:
         self.pop_handler()
      self.top_handler().end(self, tag)

   def done(self):
      """Tell the root handler that parsing has finished."""
      self.root_handler.endParsing(self)

def attributeGet(attrs, key):
   """Return the value of the first attribute in `attrs` named `key`, or None.

   `attrs` is a sequence of (name, value) entries.
   """
   return next((entry[1] for entry in attrs if entry[0] == key), None)

def getdata(f):
   """Read everything from `f` and decode it as UTF-8, replacing invalid bytes."""
   raw = f.read()
   return raw.decode('utf8', 'replace')

class NullHandler:
   """Handler that ignores every parser event.

   Child elements are handed to the shared `null_handler` instance, so a whole
   subtree is skipped.
   """

   def beginParsing(self, parser):
      """Called when this handler becomes active; does nothing."""

   def start(self, parser, tag, attrs):
      """Return the handler for a child element: always the shared null handler."""
      return null_handler

   def startend(self, parser, tag, attrs):
      """Called for a self-closing tag; does nothing."""

   def data(self, parser, data):
      """Called for text content; does nothing."""

   def entityref(self, parser, name):
      """Called for a named entity reference; does nothing."""

   def charref(self, parser, name):
      """Called for a numeric character reference; does nothing."""

   def end(self, parse, tag):
      """Called after a child element has been closed; does nothing."""

   def endParsing(self, parser):
      """Called when this handler is removed from the stack; does nothing."""


# Shared instance used wherever a subtree should be ignored.
null_handler = NullHandler()

class BaseHandler(NullHandler):
   """Handler base that expresses the compound events via the primitive ones."""

   def startend(self, parser, tag, attrs):
      """Treat a self-closing tag as start, an empty child lifetime, then end."""
      child = self.start(parser, tag, attrs)
      child.beginParsing(parser)
      child.endParsing(parser)
      self.end(parser, tag)

   def entityref(self, parser, name):
      """Forward a named entity to data() in its source form '&name;'."""
      self.data(parser, ''.join(('&', name, ';')))

   def charref(self, parser, name):
      """Forward a character reference to data() in its source form '&#name;'."""
      self.data(parser, ''.join(('&#', name, ';')))

class TextHandler(BaseHandler):
   """Handler that re-serialises everything it sees as markup text.

   Each piece of output is passed to `callback`.  `done_callback` (may be
   None) is invoked without arguments when this handler finishes; handlers
   created for nested elements have no done callback and share `callback`.
   """

   def __init__(self, callback, done_callback):
      self.callback = callback
      self.done_callback = done_callback

   def print_attrs(self, attrs, closing_tag):
      """Emit the attribute list followed by `closing_tag`.

      Every attribute is preceded by a space.  Attributes whose value is None
      (the parser saw no value) are emitted as a bare name; others are emitted
      as name="value" with backslashes and double quotes backslash-escaped.
      """
      emit = self.callback
      for attr in attrs:
         emit(' ')
         emit(attr[0])
         if attr[1] is not None:
            # Escape backslashes first so the backslash added in front of a
            # quote is not itself doubled.
            escaped = attr[1].replace("\\", "\\\\").replace('"', '\\"')
            emit('="')
            emit(escaped)
            emit('"')
      emit(closing_tag)

   def start(self, parser, tag, attrs):
      """Emit the opening tag and return the handler for the element's content."""
      self.callback('<')
      self.callback(tag)
      self.print_attrs(attrs, '>')
      if self.done_callback is None:
         return self
      # Nested elements must not trigger the done callback when they close.
      return TextHandler(self.callback, None)

   def startend(self, parser, tag, attrs):
      """Emit a self-closing tag."""
      self.callback('<')
      self.callback(tag)
      self.print_attrs(attrs, '/>')

   def data(self, parser, data):
      """Emit text content unchanged."""
      self.callback(data)

   def end(self, parser, tag):
      """Emit the closing tag of a child element."""
      self.callback('</')
      self.callback(tag)
      self.callback('>')

   def endParsing(self, parser):
      """Invoke the done callback, if there is one."""
      if self.done_callback is not None:
         self.done_callback()

def textGatherer(callback):
   """Return a TextHandler that collects its output and reports it as one string.

   When the handler finishes, `callback` receives the concatenated text and
   the buffer is emptied so the handler can be reused.
   """
   pieces = []
   def flush():
      text = ''.join(pieces)
      callback(text)
      del pieces[:]
   return TextHandler(pieces.append, flush)

def valueGatherer(dic, key):
   """Return a text-gathering handler that stores its result in `dic[key]`."""
   def store(text):
      dic[key] = text
   return textGatherer(store)

class MatchingHandler(BaseHandler):
   """Handler that descends only into child elements matching one pattern.

   A child matches when its tag equals `match_tag` and every (name, value)
   pair in `match_attrs` appears in its attribute list.  Matching children are
   handled by `next_handler` after `attributes_callback` has been given their
   attributes; all other children are ignored.  `done_callback` is called
   without arguments when this handler finishes.
   """

   def __init__(self, match_tag, match_attrs, next_handler,
                done_callback = lambda: None,
                attributes_callback = lambda attrs: None):
      self.match_tag = match_tag
      self.match_attrs = match_attrs
      self.next_handler = next_handler
      self.done_callback = done_callback
      self.attributes_callback = attributes_callback

   def start(self, parser, tag, attrs):
      """Return `next_handler` for a matching child, the null handler otherwise."""
      has_all_attrs = all(wanted in attrs for wanted in self.match_attrs)
      if not (tag == self.match_tag and has_all_attrs):
         return null_handler
      self.attributes_callback(attrs)
      return self.next_handler

   def endParsing(self, parser):
      self.done_callback()

class MultipleMatchHandler(BaseHandler):
   """Handler that picks a child handler from several alternative patterns.

   `cases` is a sequence of [tag, attrs, handler] entries, optionally extended
   with a fourth item: a callback that receives the attribute list of a
   matching element.  The first case whose tag equals the element's tag and
   whose (name, value) pairs all occur in the element's attributes wins.
   `done_callback` is called without arguments when this handler finishes.
   """

   def __init__(self, cases, done_callback = lambda: None):
      self.cases = cases
      self.done_callback = done_callback
      for case in cases:
         # Three items, or four when an attributes callback is supplied.
         assert len(case) in (3, 4)

   def start(self, parser, tag, attrs):
      """Return the handler of the first matching case, or the null handler."""
      for case in self.cases:
         match_tag = case[0]
         match_attrs = case[1]
         next_handler = case[2]
         if tag != match_tag:
            continue
         if not all(match_attr in attrs for match_attr in match_attrs):
            continue
         if len(case) > 3:
            # Optional fourth item: attributes callback.
            case[3](attrs)
         return next_handler
      return null_handler

   def endParsing(self, parsing):
      self.done_callback()

class SequentialMatchHandler(BaseHandler):
   """Handler that matches child elements against patterns in a fixed order.

   `cases` is a sequence of [tag, attrs, handler] entries.  Only the case at
   the current position is considered; when a child matches it, the position
   advances.  Children that do not match are ignored.  When this handler
   finishes, `done_callback` is called and the position is reset.
   """

   def __init__(self, cases, done_callback = lambda: None):
      self.cases = cases
      self.index = 0
      self.done_callback = done_callback
      for case in cases:
         assert(len(case) == 3)

   def start(self, parser, tag, attrs):
      """Return the current case's handler if the child matches it, else the null handler."""
      if self.index >= len(self.cases):
         # Every case has already been consumed.
         return null_handler
      expected = self.cases[self.index]
      wanted_tag = expected[0]
      wanted_attrs = expected[1]
      child_handler = expected[2]
      has_all_attrs = all(wanted in attrs for wanted in wanted_attrs)
      if not (tag == wanted_tag and has_all_attrs):
         return null_handler
      self.index += 1
      return child_handler

   def endParsing(self, parser):
      self.done_callback()
      self.index = 0

class DataOverrider(BaseHandler):
   """Handler that routes text to one handler and tag events to another.

   Tag events (begin, start, startend, end) are delegated to `tags_handler`.
   Each piece of text is given to `data_handler` wrapped in its own
   beginParsing/endParsing pair.  On finish, `done_callback` is called with the
   parser; it defaults to `tags_handler.endParsing`.
   """

   def __init__(self, data_handler, tags_handler, done_callback = None):
      self.data_handler = data_handler
      self.tags_handler = tags_handler
      self.done_callback = (tags_handler.endParsing
                            if done_callback is None else done_callback)

   def beginParsing(self, parser):
      self.tags_handler.beginParsing(parser)

   def start(self, parser, tag, attrs):
      return self.tags_handler.start(parser, tag, attrs)

   def startend(self, parser, tag, attrs):
      self.tags_handler.startend(parser, tag, attrs)

   def data(self, parser, data):
      text_handler = self.data_handler
      text_handler.beginParsing(parser)
      text_handler.data(parser, data)
      text_handler.endParsing(parser)

   def end(self, parser, tag):
      return self.tags_handler.end(parser, tag)

   def endParsing(self, parser):
      self.done_callback(parser)

def listPageRowReader(row_callback):
   """Build the handler that reads one row of the thread list table.

   The row's cells are matched in order: the first <td> is ignored, the second
   yields 'title' (text of its <a>) and 'link' (that <a>'s href), the third
   yields 'numposts' and the fourth yields 'last_post_time' (text of its
   <small>).  When the row ends, `row_callback` receives a copy of the
   collected dict and the dict is cleared for the next row.
   """
   values = {}

   def linkGetter(attrs):
      href = attributeGet(attrs, 'href')
      if href is not None:
         values['link'] = href

   def done():
      row_callback(values.copy())
      values.clear()

   title_cell = MatchingHandler('a', [],
                                valueGatherer(values, 'title'),
                                lambda: None,
                                linkGetter)
   numposts_cell = valueGatherer(values, 'numposts')
   last_post_cell = MatchingHandler('small', [],
                                    valueGatherer(values, 'last_post_time'))
   cells = [
      ['td', [], null_handler],
      ['td', [], title_cell],
      ['td', [], numposts_cell],
      ['td', [], last_post_cell],
   ]
   return SequentialMatchHandler(cells, done)

def postReader(post_callback):
   """Build the handler that reads one post of a thread page.

   Inside the post's <h3> it collects 'postnum', 'postername', 'postermeiru'
   (href of the name link, if any), 'postertrip', 'posterid' and 'posterdate';
   inside its <blockquote> it collects 'post' from the <p>.  When the post
   ends, `post_callback` receives a copy of the collected dict and the
   working dicts are cleared.
   """
   name_values = {}
   values = {}

   def meiruGetter(attrs):
      href = attributeGet(attrs, 'href')
      if href is not None:
         values['postermeiru'] = href

   def name_done(parser):
      # Prefer the text of the name link; fall back to the unlinked text.
      fallback = name_values.get('unlinked_name', '')
      values['postername'] = name_values.get('linked_name', fallback)

   def done():
      post_callback(values.copy())
      values.clear()
      name_values.clear()

   postnum_handler = MatchingHandler('a', [], valueGatherer(values, 'postnum'))

   name_handler = DataOverrider(
      valueGatherer(name_values, 'unlinked_name'),
      MatchingHandler('a', [],
         valueGatherer(name_values, 'linked_name'),
         lambda: None,
         meiruGetter),
      name_done)

   info_handler = SequentialMatchHandler([
      ['span', [('class', 'postername')], name_handler],
      ['span', [('class', 'postertrip')], valueGatherer(values, 'postertrip')],
      ['span', [('class', 'id')], valueGatherer(values, 'posterid')],
      ['span', [('class', 'posterdate')], valueGatherer(values, 'posterdate')]])

   header_handler = SequentialMatchHandler([
      ['span', [('class', 'postnum')], postnum_handler],
      ['span', [('class', 'postinfo')], info_handler]])

   body_handler = MatchingHandler('p', [], valueGatherer(values, 'post'))

   return SequentialMatchHandler([
         ['h3', [], header_handler],
         ['blockquote', [], body_handler]],
      done)

def listPageReader(row_callback, done_callback = lambda: None):
   """Build the root handler for a thread list page.

   Descends html > body > div.hborder > div.head > table.threads > tbody > tr
   and reads each row with listPageRowReader.  `done_callback` is attached to
   the outermost (html) handler.
   """
   # Elements between <html> and the row reader, outermost first.
   inner_path = [
      ('body', []),
      ('div', [('class', 'hborder')]),
      ('div', [('class', 'head')]),
      ('table', [('class', 'threads')]),
      ('tbody', []),
      ('tr', []),
   ]
   handler = listPageRowReader(row_callback)
   # Wrap from the innermost element outwards.
   for tag, attrs in reversed(inner_path):
      handler = MatchingHandler(tag, attrs, handler)
   return MatchingHandler('html', [], handler, done_callback)

def threadPageReader(title_callback, post_callback, done_callback = lambda: None):
   """Build the root handler for a thread (read) page.

   Inside html > body, the text of an <h2> is reported through
   `title_callback`, and every div.post (even or odd) inside div.thread is read
   with a shared postReader that reports through `post_callback`.
   `done_callback` is attached to the outermost (html) handler.
   """
   post_reader = postReader(post_callback)
   posts_handler = MultipleMatchHandler([
      ['div', [('class', 'post even')], post_reader],
      ['div', [('class', 'post odd')], post_reader]])
   body_content = MultipleMatchHandler([
      ['h2', [], textGatherer(title_callback)],
      ['div', [('class', 'thread')], posts_handler]])
   body_handler = MatchingHandler('body', [], body_content)
   return MatchingHandler('html', [], body_handler, done_callback)

#######
#### ProgScraper
####
#### Creates urls and downloads pages using a http(s) connection.
####

class ScraperException(Exception):
   """Base class for errors raised by the scraper."""


class BadResponse(ScraperException):
   """Raised when the server answers with an unexpected status.

   The offending response object is available as `.response`.
   """
   def __init__(self, response):
      ScraperException.__init__(self, response)
      self.response = response

class ProgScraper:
   def __init__(self, conn_gen, min_req_delay = 0, max_req_delay = 2):
      self.conn_gen = conn_gen
      self.min_req_delay = min_req_delay
      self.max_req_delay = max_req_delay
      self.rand = ra.Random()
   def read_get(self, url):
      tm.sleep(self.rand.randint(self.min_req_delay, self.max_req_delay))
      print "starting download: ", url
      sys.stdout.flush()
      conn = self.conn_gen()
      conn.request('GET', url)
      response = conn.getresponse()
      if response.status == 200:
         print "finished download: ", url
         sys.stdout.flush()
         data = getdata(response)
         conn.close()
         return data
      else:
         print "bad response getting ", url, ":", response
         sys.stdout.flush()
         conn.close()
         raise BadResponse(response)
   def get_subject(self, board):
      return self.read_get('/' + board + '/subject.txt')
   def get_list_page(self, board, n):
      return self.read_get('/list/' + board + '/' + str(n))
   def get_read_page(self, board, thread, range_expr = ''):
      return self.read_get('/read/' + board + '/' + str(thread) + '/' + range_expr)

#######
#### DummyScraper
####
#### A dummy implementation of ProgScraper for testing that doesn't use any socket.
####

class DummyScraper:
   def get_file(self, filename):
      file = open(filename, 'r')
      data = getdata(file)
      file.close()
      return data
   def get_subject(self, board):
      print "DummyScrapper getting subject for", board
      sys.stdout.flush()
      return self.get_file('subject.txt')
   def get_list_page(self, board, n):
      print "DummyScrapper getting list for", board, n
      sys.stdout.flush()
      return self.get_file('list.html')
   def get_read_page(self, board, thread, range_expr = ''):
      print "DummyScrapper getting read page for", board, thread, range_expr
      sys.stdout.flush()
      return self.get_file('thread.html')

#######
#### ProgParser
####
#### Routines for extracting data from fetched pages.
####

class ProgParser:
   """Routines for extracting data from fetched pages."""

   def parse_subject(self, subject):
      """Split a subject.txt body into one list of '<>'-separated fields per line."""
      return [line.split('<>') for line in subject.split('\n')]

   # Position of each field within a parsed subject.txt line.
   subject_indeces = {
      'title' : 0,
      'opname' : 1,
      'noicon' : 2, # wtf is this noicon.png?
      'threadid' : 3,
      'numposts' : 4,
      'lastpostername' : 5,
      'lastpostertime' : 6,
      'numindeces' : 7,
   }

   def parse_list_page(self, page):
      """Parse a thread list page and return a list with one dict per table row."""
      rows = []
      reader = StackMachineHTMLParser(listPageReader(rows.append))
      reader.feed(page)
      reader.close()
      return rows

   def parse_read_page(self, page):
      """Parse a thread page into {'title': text, 'posts': [dict per post]}."""
      values = { 'title' : '', 'posts' : []}
      def title_callback(value):
         values['title'] = value
      reader = StackMachineHTMLParser(threadPageReader(title_callback, values['posts'].append))
      reader.feed(page)
      reader.close()
      return values

   def parse_timestamp(self, string):
      """Convert 'YYYY-MM-DD HH:MM' into integer seconds since the epoch.

      The timestamp is interpreted in the process's local time zone.  Returns
      None when the string cannot be parsed or converted.
      """
      try:
         (date_str, time_str) = string.split(' ')
         (year, month, day) = [int(part) for part in date_str.split('-')]
         (hour, minute) = [int(part) for part in time_str.split(':')]
         stamp = dt.datetime(year, month, day, hour, minute)
         # time.mktime is the portable equivalent of strftime('%s'), which is
         # a platform extension not available everywhere.
         return int(tm.mktime(stamp.timetuple()))
      except Exception:
         # Best effort: any malformed input yields None.
         return None

#######
#### BEGIN PROGDB
####
#### Reads and writes data to a sqlite database.
####

class ProgDB:
   """Reads and writes thread and post data through a sqlite connection.

   Nothing is committed automatically; call commit() to persist changes.
   """

   # Key of the single row kept in the syncdata table.
   dummykey = 0

   def __init__(self, conn):
      self.conn = conn

   def commit(self):
      """Commit the connection's pending transaction."""
      self.conn.commit()

   def create(self):
      """Create the tables, the singleton syncdata row and the indexes if missing."""
      cur = self.conn.cursor()
      cur.execute('CREATE TABLE '
                          'IF NOT EXISTS '
                          'threads (threadid integer,'
                                   'title text,'
                                   'board text,'
                                   'firstposttime integer,'
                                   'lastposttime integer,'
                                   'lastbumptime integer,'
                                   'numposts integer,'
                                   'PRIMARY KEY (threadid))')
      cur.execute('CREATE TABLE '
                          'IF NOT EXISTS '
                          'posts (threadid integer,'
                                 'postnum integer,'
                                 'postername text,'
                                 'postermeiru text,'
                                 'postertrip text,'
                                 'posterid text,'
                                 'posterdate integer,'
                                 'post text,'
                                 'PRIMARY KEY (threadid, postnum),'
                                 'FOREIGN KEY (threadid) REFERENCES threads(threadid))')
      # singleton table
      cur.execute('CREATE TABLE '
                          'IF NOT EXISTS '
                          'syncdata (dummykey integer,'
                                    'lastsynctime integer,'
                                    'PRIMARY KEY (dummykey))')
      cur.execute('INSERT OR IGNORE INTO syncdata VALUES (?, NULL)', (ProgDB.dummykey,))
      cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS threads_id on threads(threadid)')
      cur.execute('CREATE INDEX IF NOT EXISTS threads_lastbumptime on threads(board, lastbumptime)')
      cur.execute('CREATE INDEX IF NOT EXISTS threads_lastposttime on threads(board, lastposttime)')
      cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS posts_id on posts(threadid, postnum)')

   def write_thread(self, threadid, title, board, firstposttime, lastposttime, lastbumptime, numposts):
      """Insert a thread row; an existing row with the same id is left untouched."""
      cur = self.conn.cursor()
      cur.execute('INSERT OR IGNORE INTO threads values (?,?,?,?,?,?,?)',
                  (threadid, title, board, firstposttime, lastposttime, lastbumptime, numposts))

   def write_empty_thread(self, threadid, title, board):
      """Insert a thread row with all times and the post count set to 0."""
      self.write_thread(threadid, title, board, 0, 0, 0, 0)

   def update_thread(self, threadid, firstposttime, lastposttime, lastbumptime, numposts):
      """Overwrite a thread's times; numposts only ever grows."""
      cur = self.conn.cursor()
      # threadid is the primary key, so at most one row matches; no LIMIT
      # clause is used because UPDATE ... LIMIT is an optional SQLite feature.
      cur.execute('UPDATE threads SET firstposttime = ?,'
                                     'lastposttime = ?,'
                                     'lastbumptime = ?,'
                                     'numposts = max(numposts, ?)'
                                 'WHERE threadid = ?',
                   (firstposttime, lastposttime, lastbumptime, numposts, threadid))

   def insert_post(self, threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post):
      """Insert a post row; an existing (threadid, postnum) row is left untouched."""
      cur = self.conn.cursor()
      cur.execute('INSERT OR IGNORE INTO posts values (?,?,?,?,?,?,?,?)',
                  (threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post) )

   def write_post(self, threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post):
      """Insert a post and bring the owning thread's summary columns up to date."""
      self.insert_post(threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, post)
      cur = self.conn.cursor()
      # See update_thread for why there is no LIMIT clause.
      cur.execute('UPDATE threads SET lastposttime = max(lastposttime, ?),'
                                     'numposts = max(numposts, ?)'
                                 'WHERE threadid = ?',
                  (posterdate, postnum, threadid))
      if postermeiru != 'mailto:sage':
         # Posts sent with the 'sage' mail address do not move the bump time.
         cur.execute('UPDATE threads SET lastbumptime = max(lastbumptime, ?)'
                                    'WHERE threadid = ?',
                     (posterdate, threadid))
      if postnum == 1:
         cur.execute('UPDATE threads SET firstposttime = ?'
                                    'WHERE threadid = ?',
                     (posterdate, threadid))

   def _select_paged(self, sql, key, limit, offset):
      """Execute `sql` (one '?' for `key`) with optional limit/offset; return the cursor."""
      params = [key]
      if limit is not None or offset is not None:
         sql += ' limit ?'
         # A negative LIMIT means "no upper bound" in SQLite; this lets an
         # offset be used without a limit.
         params.append(-1 if limit is None else limit)
      if offset is not None:
         sql += ' offset ?'
         params.append(offset)
      cur = self.conn.cursor()
      cur.execute(sql, tuple(params))
      return cur

   def get_threads_by_board(self, board, limit = None, offset = None):
      """Return a cursor over the threads of `board`, most recently bumped first."""
      return self._select_paged(
         'select * from threads where threads.board = ? ORDER BY threads.lastbumptime DESC',
         board, limit, offset)

   def get_posts_by_thread(self, threadid, limit = None, offset = None):
      """Return a cursor over the posts of a thread, ordered by post number."""
      return self._select_paged(
         'select * from posts where posts.threadid = ? order by posts.postnum',
         threadid, limit, offset)

   def get_thread(self, threadid):
      """Return the thread row as a tuple, or None if there is no such thread."""
      cur = self.conn.cursor()
      cur.execute('select * from threads where threads.threadid = ?', (threadid,))
      return cur.fetchone()

   def get_post(self, threadid, postnum):
      """Return the post row as a tuple, or None if there is no such post."""
      cur = self.conn.cursor()
      cur.execute('select * from posts where posts.threadid = ? and posts.postnum = ?', (threadid, postnum))
      return cur.fetchone()

   class ThreadEditor:
      """Accumulates a thread's summary values in memory; write_back() stores them."""

      def __init__(self, db, threadid, firstposttime, lastposttime, lastbumptime, numposts):
         self.db = db
         self.threadid = threadid
         self.firstposttime = firstposttime
         self.lastposttime = lastposttime
         self.lastbumptime = lastbumptime
         self.numposts = numposts

      def add_post(self, postnum, postermeiru, posterdate):
         """Fold one post into the summary (same rules as ProgDB.write_post)."""
         if postnum == 1:
            self.firstposttime = posterdate
         self.lastposttime = max(self.lastposttime, posterdate)
         if postermeiru != 'mailto:sage':
            self.lastbumptime = max(self.lastbumptime, posterdate)
         self.numposts = max(self.numposts, postnum)

      def write_back(self):
         """Store the accumulated summary through ProgDB.update_thread."""
         self.db.update_thread(self.threadid, self.firstposttime, self.lastposttime, self.lastbumptime, self.numposts)

   def get_thread_editor(self, threadid):
      """Return a ThreadEditor seeded from the stored row of `threadid`.

      The thread must already exist: unpacking the missing row raises
      TypeError otherwise.
      """
      (threadid_, title, board, firstposttime, lastposttime, lastbumptime, numposts) = self.get_thread(threadid)
      assert(threadid_ == threadid)
      return ProgDB.ThreadEditor(self, threadid, firstposttime, lastposttime, lastbumptime, numposts)

   def get_syncdata(self):
      """Return the singleton syncdata row as (dummykey, lastsynctime)."""
      cur = self.conn.cursor()
      cur.execute('SELECT * FROM syncdata WHERE dummykey = ? LIMIT 1', (ProgDB.dummykey,))
      return cur.fetchone()

   def set_lastsynctime(self, lastsynctime):
      """Store `lastsynctime` in the singleton syncdata row."""
      cur = self.conn.cursor()
      cur.execute('UPDATE syncdata SET lastsynctime = ?'
                                  'WHERE dummykey = ?',
                  (lastsynctime, ProgDB.dummykey))

   def get_lastsynctime(self):
      """Return the stored last sync time (None until one has been set)."""
      (dummykey, lastsynctime) = self.get_syncdata()
      return lastsynctime

   def current_progtime(self):
      """Return the current time shifted to UTC-4 as integer epoch seconds.

      The shifted wall-clock time is converted as if it were local time, which
      is the same conversion strftime('%s') performs where that platform
      extension exists; time.mktime is used because it is portable.
      """
      shifted = dt.datetime.utcnow() - dt.timedelta(hours=4)
      return int(tm.mktime(shifted.timetuple()))


# Column names of the threads table, in table order.
thread_keys = (
   'threadid',
   'title',
   'board',
   'firstposttime',
   'lastposttime',
   'lastbumptime',
   'numposts',
)
# Column names of the posts table, in table order.
post_keys = (
   'threadid',
   'postnum',
   'postername',
   'postermeiru',
   'postertrip',
   'posterid',
   'posterdate',
   'post',
)

def reverse_array_map(arr, inverse_map):
   """Record in `inverse_map` the index at which each element of `arr` occurs.

   `inverse_map` is modified in place; for repeated elements the last index
   wins.
   """
   for index, item in enumerate(arr):
      inverse_map[item] = index

# Column name -> position lookups for rows of the threads and posts tables.
thread_key_indeces = {}
post_key_indeces = {}

reverse_array_map(thread_keys, thread_key_indeces)
reverse_array_map(post_keys, post_key_indeces)

def print_row(keys, values, indent = ''):
   assert(len(keys) == len(values))
   print indent, [(keys[i], values[i]) for i in xrange(0,len(keys))]
   sys.stdout.flush()

def printDB(progdb):
   """Print every thread of the 'prog' board, each followed by its indented posts."""
   for thread in progdb.get_threads_by_board('prog'):
      thread_id = thread[0]
      print_row(thread_keys, thread)
      for post in progdb.get_posts_by_thread(thread_id):
         print_row(post_keys, post, '    ')

#######
#### BEGIN ProgSynch
####
#### Takes data from ProgScraper and updates a ProgDB.
####

class ProgSynch:
   def __init__(self, scraper, parser, db):
      self.scraper = scraper
      self.parser = parser
      self.db = db
      self.abortOp = False
   def abort(self):
      self.abortOp = True
   def reset(self):
      self.abortOp = False
   def commit(self):
      self.db.commit()
   def write_thread_page_data(self, board, threadid, page, expected_postnum_start = 1):
      parsed_page = self.parser.parse_read_page(page)
      if not 'title' in parsed_page:
         print "warning: malformed thread, could not read title:", parsed_page
         sys.stdout.flush()
      self.db.write_empty_thread(threadid, parsed_page.get('title', ''), board)
      editor = self.db.get_thread_editor(threadid)
      backup_postnum = expected_postnum_start
      for post in parsed_page['posts']:
         postnum_valid = False
         # keep a post num count for recovery
         try:
            postnum = int(post['postnum'])
            backup_postnum = postnum
            postnum_valid = True
         except Exception:
            postnum = backup_postnum
         backup_postnum += 1
         postername = post.get('postername', '')
         postermeiru = post.get('postermeiru', '')
         postertrip = post.get('postertrip', '')
         posterid = post.get('posterid', '')
         posterdate = post.get('posterdate', '')
         text = post.get('post', '')
         if not postnum_valid or        \
            not 'postername' in post or \
            not 'postertrip' in post or \
            not 'posterid' in post or   \
            not 'posterdate' in post or \
            not 'post' in post:
            print "warning: malformed post in read page.", post
            sys.stdout.flush()
         editor.add_post(postnum, postermeiru, posterdate)
         self.db.insert_post(threadid, postnum, postername, postermeiru, postertrip, posterid, posterdate, text)
      editor.write_back()
   def write_thread_page_file(self, board, threadid, filename):
      file = open(filename, 'r')
      page = getdata(file)
      file.close()
      self.write_thread_page_data(board, threadid, page)
   def write_thread_page(self, board, threadid, range_expr = ''):
      page = self.scraper.get_read_page(board, threadid, range_expr)
      self.write_thread_page_data(board, threadid, page)
   def synch_subject_page_data(self, board, page):
      parsed_page = self.parser.parse_subject(page)
      modified = False
      current_synctime = self.db.current_progtime()
      lastsynctime = self.db.get_lastsynctime()
      for entry in parsed_page:
         if self.abortOp:
            break
         try:
            assert(len(entry) == ProgParser.subject_indeces['numindeces'])
            threadid = int(entry[ProgParser.subject_indeces['threadid']])
            numposts = int(entry[ProgParser.subject_indeces['numposts']])
            lastposttime = int(entry[ProgParser.subject_indeces['lastpostertime']])
            local_thread = self.db.get_thread(threadid)
            if local_thread == None:
               # thread does not exist in local storage. Synch the entire thread.
               #print "local thread not found", threadid, numposts
               self.write_thread_page(board, threadid)
               modifed = True
            else:
               local_numposts = int(local_thread[thread_key_indeces['numposts']])
               #print "local thread found", threadid, "local", local_numposts, "subject", numposts, "last post time", lastposttime, "last sync time", lastsynctime
               if lastsynctime <= lastposttime and local_numposts < numposts:
                  # new posts have appeared in the thread. Synch the new posts.
                  self.write_thread_page(board, threadid, str(local_numposts + 1) + '-')
                  modifed = True
         except Exception as e:
            print "malformed entry in subject page. Could not synch:", entry
            print e.message, e.args
            sys.stdout.flush()
      self.db.set_lastsynctime(current_synctime)
      return modified
   def synch_subject_page_file(self, board, filename):
      file = open(filename, 'r')
      page = getdata(file)
      file.close()
      return self.synch_subject_page_data(board, page)
   def synch_subject_page(self, board):
      page = self.scraper.get_subject(board)
      return self.synch_subject_page_data(board, page)
   def synch_list_page_data(self, board, page):
      parsed_page = self.parser.parse_list_page(page)
      lastsynchtime = self.db.get_lastsynctime()
      modified = False
      for entry in parsed_page:
         if self.abortOp:
            break
         try:
            link = entry['link']
            numposts = int(entry['numposts'])
            last_post_time = entry['last_post_time']
            last_post_timestamp = self.parser.parse_timestamp(last_post_time)
            prefix = 'read/' + board + '/'
            assert(link.startswith(prefix))
            threadid = int(link[len(prefix):])
            local_thread = self.db.get_thread(threadid)
            if local_thread == None:
               # thread does not exist in local db. Synch entire thread.
               self.write_thread_page(board, threadid)
               modified = True
            else:
               local_numposts = local_thread[thread_key_indeces['numposts']]
               if (last_post_timestamp == None or lastsynctime < last_post_timestamp) and \
                  local_numposts < numposts:
                  # synch the new posts
                  self.write_thread_page(board, threadid, str(local_numposts + 1) + '-')
         except Exception as e:
            print "error: malformed entry in list page. Could not synch:", entry
            print e.message, e.args
            sys.stdout.flush()
      return modified
   def synch_list_page_file(self, board, filename):
      file = open(filename, 'r')
      page = getdata(file)
      file.close()
      return self.synch_list_page_data(board, page)
   def synch_list_page(self, board, list_page_num):
      page = self.scraper.get_list_page(board, list_page_num)
      return self.synch_list_page_data(board, page)
   def synch_top_list_pages(self, board, num_pages):
      for i in xrange(1,num_pages + 1):
         if self.abortOp:
            break
         self.synch_list_page(board, i)
   def synch_list_pages_while_modifying(self, board):
      modified = True # kick start
      index = 1
      while modified:
         if self.abortOp:
            break
         modified = self.synch_list_page(board, index)
         index += 1
   def import_threads_dir(self, board, dirname, filename_to_threadid = lambda filename: int(filename)):
      for filename in os.listdir(dirname):
         if self.abortOp:
            break
         print "importing thread", filename
         sys.stdout.flush()
         threadid = filename_to_threadid(filename)
         self.write_thread_page_file(board, threadid, dirname + '/' + filename)
   def import_nested_threads_dir(self, board, dirname, filename_to_threadid = lambda filename: int(filename)):
      for (path, subdirs, filenames) in os.walk(dirname):
         if self.abortOp:
            break
         print "importing all threads under ", path
         sys.stdout.flush()
         for filename in filenames:
            if self.abortOp:
               break
            threadid = 0
            try:
               threadid = filename_to_threadid(filename)
            except Exception:
               continue
            filepath = path + '/' + filename
            print "importing thread", filepath
            sys.stdout.flush()
            self.write_thread_page_file(board, threadid, filepath)

#######
#### BEGIN ProgAsynch
####
#### Issues async commands to a ProgSynch and can abort commands.
####

class ProgAsynch:
   """Runs one command at a time against a sync object on a background thread."""

   def __init__(self, sync):
      """Remember the sync object handed to every command."""
      self.sync = sync
      # thread of the most recently started command; None when there is none
      self.running_thread = None

   def start_command(self, command):
      """Run command(sync) on a new thread unless a command is still running.

      Returns True when the command was started and False when a previously
      started command is still alive.  A previous command that has already
      finished is joined and forgotten here, so calling wait() between
      commands is not required.  The sync's abort flag is reset before the
      new thread starts.
      """
      previous = self.running_thread
      if previous is not None:
         if previous.is_alive():
            # another command is running...
            return False
         # the old command is finished: reap it and fall through
         previous.join()
         self.running_thread = None
      self.sync.reset()
      self.running_thread = thrd.Thread(target=command, args=(self.sync,))
      self.running_thread.start()
      return True

   def abort(self):
      """Ask the sync object to abort, if a command thread is recorded."""
      if self.running_thread is not None:
         self.sync.abort()

   def is_done(self):
      """Return True when no command thread is recorded or it has finished."""
      return self.running_thread is None or not self.running_thread.is_alive()

   def wait(self):
      """Block until the recorded command thread finishes, then forget it."""
      if self.running_thread is not None:
         self.running_thread.join()
         self.running_thread = None

def parser_test1():
   scraper = DummyScraper()
   parser = ProgParser()
   page = scraper.get_read_page('prog', 1337)
   parsed_page = parser.parse_read_page(page)
   print "parsed page"
   print parsed_page

def synch_test1():
   """Write thread 1337 of 'prog' from the DummyScraper into prog.db."""
   scraper = DummyScraper()
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      synch.write_thread_page('prog', 1337)
      synch.commit()
      #printDB(db)
   finally:
      # release the connection even when the sync raises
      dbconn.close()

def synch_test3():
   """Synch prog.db from the DummyScraper's subject page of 'prog'."""
   scraper = DummyScraper()
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      synch.synch_subject_page('prog')
      synch.commit()
      #printDB(db)
   finally:
      # release the connection even when the sync raises
      dbconn.close()

def synch_test4():
   """Synch prog.db from the first five list pages of the DummyScraper."""
   scraper = DummyScraper()
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      synch.synch_top_list_pages('prog', 5)
      synch.commit()
      #printDB(db)
   finally:
      # release the connection even when the sync raises
      dbconn.close()

def synch_test5():
   """Import the '<threadid>.html' files of directory 'threads2' into prog.db."""
   scraper = DummyScraper()
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      def filename_to_threadid(filename):
         # '<threadid>.html' -> threadid; an explicit raise is used because
         # an assert would be stripped under -O
         suffix = '.html'
         if not filename.endswith(suffix):
            raise ValueError("not a thread file: " + filename)
         return int(filename[0:len(filename) - len(suffix)])
      synch.import_threads_dir('prog', 'threads2', filename_to_threadid)
      synch.commit()
      #printDB(db)
   finally:
      # release the connection even when the import raises
      dbconn.close()

def synch_test6():
   """Import every '<threadid>.html' file found below 'threads' into prog.db."""
   scraper = DummyScraper()
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      def filename_to_threadid(filename):
         # '<threadid>.html' -> threadid; an explicit raise is used because
         # an assert would be stripped under -O
         suffix = '.html'
         if not filename.endswith(suffix):
            raise ValueError("not a thread file: " + filename)
         return int(filename[0:len(filename) - len(suffix)])
      synch.import_nested_threads_dir('prog', 'threads', filename_to_threadid)
      synch.commit()
      #printDB(db)
   finally:
      # release the connection even when the import raises
      dbconn.close()

def demo1():
   """Synch prog.db from the first five list pages of 'prog' on dis.4chan.org."""
   conn_gen = lambda: http.HTTPSConnection('dis.4chan.org', timeout=20)
   scraper = ProgScraper(conn_gen)
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      synch.synch_top_list_pages('prog', 5)
      synch.commit()
      #printDB(db)
   finally:
      # release the connection even when the sync raises
      dbconn.close()

# sync all of /prog/
def demo2():
   """Synch prog.db from the subject page of 'prog' on dis.4chan.org."""
   conn_gen = lambda: http.HTTPSConnection('dis.4chan.org', timeout=20)
   scraper = ProgScraper(conn_gen)
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      synch.synch_subject_page('prog')
      synch.commit()
      #printDB(db)
   finally:
      # release the connection even when the sync raises
      dbconn.close()

# sync from dummy subject but then stop after 30 seconds.
def async_test1():
   """Start a subject-page sync on a worker thread and abort it after 30 seconds.

   The connection is opened with check_same_thread=False because the worker
   thread uses it while this thread created it.
   """
   scraper = DummyScraper()
   parser = ProgParser()
   dbname = 'prog.db'
   dbconn = sq.connect(dbname, check_same_thread=False)
   try:
      db = ProgDB(dbconn)
      db.create()
      synch = ProgSynch(scraper, parser, db)
      asynch = ProgAsynch(synch)

      def command(synch):
         synch.synch_subject_page('prog')
         synch.commit()

      asynch.start_command(command)
      try:
         tm.sleep(30)
      finally:
         # stop the worker before the connection it uses is closed, even
         # when the sleep is interrupted
         asynch.abort()
         asynch.wait()

      #printDB(db)
   finally:
      dbconn.close()