Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import email
- import getpass, imaplib
- import os
- import sys
- import re
- import sqlite3
- from datetime import datetime
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import TextConverter
- from pdfminer.layout import LAParams
- from pdfminer.pdfpage import PDFPage
- from cStringIO import StringIO
# File name of the SQLite database that receives the parsed readings.
sqlite_db = 'thames_flow.sqlite'
# Base directory; report PDFs are stored under <detach_dir>/attachments/.
detach_dir = '.'
# Directory that holds the SQLite database file.
db_dir = '.'
# IMAP account used to fetch the report e-mails.
userName = 'flowdata@richmondcanoeclub.com'
# IMAP password. Read from the FLOWDATA_IMAP_PASSWORD environment variable so
# the secret does not have to be written into the source; falls back to the
# previous hard-coded value (an empty string) when the variable is unset.
passwd = os.environ.get('FLOWDATA_IMAP_PASSWORD', '')
def _safe_dir_name(message_id, fallback):
    """Return a directory-safe name derived from a Message-ID header value.

    Anything other than word characters and ``. @ + -`` is replaced by ``_``
    so a hostile header cannot inject path separators. ``fallback`` is
    returned when the header is missing or reduces to nothing but dots and
    underscores (which also rules out ``.`` and ``..``).
    """
    raw = '' if message_id is None else str(message_id)
    cleaned = re.sub(r'[^\w.@+-]', '_', raw.strip().strip('<>'))
    if not cleaned.strip('._'):
        return fallback
    return cleaned


def _save_report_attachments(msgNum, raw):
    """Write the ``Report <word>.pdf`` attachments of one raw message to disk."""
    # imaplib hands back bytes on Python 3; the bytes parser only exists there.
    if isinstance(raw, bytes) and hasattr(email, 'message_from_bytes'):
        mail = email.message_from_bytes(raw)
    else:
        mail = email.message_from_string(raw)

    msgId = _safe_dir_name(mail.get('Message-ID'), 'msg-%d' % int(msgNum))
    dirPath = os.path.join(detach_dir, 'attachments', msgId)

    for part in mail.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        # Anchored at both ends: the whole name must be "Report <word>.pdf",
        # so it cannot contain path separators or a trailing extra extension.
        if not fileName or not re.match(r'Report \w+\.pdf\Z', fileName):
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        if not os.path.isdir(dirPath):
            # makedirs also creates the 'attachments' parent on the first run.
            os.makedirs(dirPath)
        filePath = os.path.join(dirPath, fileName)
        if not os.path.isfile(filePath):
            print('%s/%s' % (msgId, fileName))
            with open(filePath, 'wb') as fp:
                fp.write(payload)


def download_attachments():
    """Download report PDFs attached to unseen INBOX messages.

    Logs in to the IMAP server with the module-level ``userName`` and
    ``passwd``, searches INBOX for UNSEEN messages and stores every
    attachment named ``Report <word>.pdf`` as
    ``<detach_dir>/attachments/<message id>/<file name>``. Files that already
    exist are left untouched.

    Raises:
        RuntimeError: if login, mailbox selection, search or fetch does not
            return an ``OK`` status.
    """
    imapSession = imaplib.IMAP4_SSL('mail.richmondcanoeclub.com')
    try:
        typ, _ = imapSession.login(userName, passwd)
        if typ != 'OK':
            raise RuntimeError('Not able to sign in!')

        typ, _ = imapSession.select('INBOX')
        if typ != 'OK':
            raise RuntimeError('Error selecting Inbox.')
        try:
            typ, data = imapSession.search(None, '(UNSEEN)')
            if typ != 'OK':
                raise RuntimeError('Error searching Inbox.')

            for msgNum in data[0].split():
                typ, messageParts = imapSession.fetch(msgNum, '(RFC822)')
                if typ != 'OK':
                    raise RuntimeError('Error fetching mail.')
                _save_report_attachments(msgNum, messageParts[0][1])
        finally:
            # close() is only valid once a mailbox has been selected.
            imapSession.close()
    finally:
        imapSession.logout()
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file using pdfminer.

    Args:
        path: location of the PDF file on disk.

    Returns:
        The text pdfminer's ``TextConverter`` wrote for the document
        (requested with the ``utf-8`` codec).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # open() replaces the Python-2-only file() builtin, and the with
            # block closes the handle even if a page fails to parse.
            with open(path, 'rb') as fp:
                # Empty page set and maxpages=0: no page restriction
                # (pdfminer's convention -- see PDFPage.get_pages).
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
def get_pdf_lines(path):
    """Return the stripped, non-blank text lines extracted from the PDF at ``path``."""
    stripped = (raw.strip() for raw in convert_pdf_to_txt(path).splitlines())
    return [text for text in stripped if text]
def parse_flowdata(lines):
    """Parse the text lines of a flow report.

    Lines ending in ``' Flow'`` are station names, lines ending in
    ``' m3/s'`` are readings, a line starting with ``dd/mm/yyyy`` is the
    report date and a line starting with ``hh:mm:ss`` is the report time.
    When several date or time lines occur, the last one wins.

    Args:
        lines: iterable of stripped text lines.

    Returns:
        A two-item list ``[measured_at, pairs]`` where ``measured_at`` is a
        ``'YYYY-MM-DD HH:MM:SS'`` string and ``pairs`` is a list of
        ``(station, value)`` tuples, paired in order of appearance.

    Raises:
        ValueError: if no date or time line is present, or they cannot be
            parsed as ``%d/%m/%Y %H:%M:%S``.
    """
    unit = ' m3/s'
    stations = []
    values = []
    time = ''
    date = ''
    for line in lines:
        if line.endswith(' Flow'):
            stations.append(line)
        elif line.endswith(unit):
            values.append(float(line[:-len(unit)]))
        elif re.match(r'\d{2}/\d{2}/\d{4}', line):
            date = line
        elif re.match(r'\d{2}:\d{2}:\d{2}', line):
            time = line

    if not date or not time:
        raise ValueError('flow report is missing its date or time line')

    measured_at = datetime.strptime('%s %s' % (date, time),
                                    '%d/%m/%Y %H:%M:%S')
    # Materialise the pairs: the result is iterated more than once by the
    # storage step, and on Python 3 a bare zip() would be exhausted after the
    # first pass. zip() stops at the shorter of the two lists.
    return [measured_at.strftime('%Y-%m-%d %H:%M:%S'),
            list(zip(stations, values))]
def parse_temperature(lines):
    """Parse the text lines of a water-temperature report.

    Lines ending in ``' Lock'`` are station names, lines ending in
    ``' deg C'`` are readings, and lines starting with a date followed by an
    ``H:MM`` or ``HH:MM`` time are measurement timestamps.

    Args:
        lines: iterable of stripped text lines.

    Returns:
        A list of ``(measured_at, station, value)`` tuples, combined in order
        of appearance, with ``measured_at`` formatted as
        ``'YYYY-MM-DD HH:MM:SS'``.

    Raises:
        ValueError: if a timestamp line cannot be parsed as
            ``%m/%d/%Y %H:%M``.
    """
    unit = ' deg C'
    stations = []
    values = []
    times = []
    for line in lines:
        if line.endswith(' Lock'):
            stations.append(line)
        elif line.endswith(unit):
            values.append(float(line[:-len(unit)]))
        # \d{1,2}: the previous \d{1} skipped every timestamp whose hour has
        # two digits, which shifted the remaining times onto wrong stations.
        elif re.match(r'\d{2}/\d{2}/\d{4} +\d{1,2}:\d{2}', line):
            # Zero-pad a single-digit hour, then collapse runs of spaces.
            padded = re.sub(r' (\d:)', r' 0\1', line)
            normalised = re.sub(' +', ' ', padded)
            # NOTE(review): month/day/year here, whereas the flow report is
            # parsed as day/month/year -- confirm against a sample report.
            stamp = datetime.strptime(normalised, '%m/%d/%Y %H:%M')
            times.append(stamp.strftime('%Y-%m-%d %H:%M:%S'))
    # A real list (not a lazy zip object) so the rows can be printed and
    # inserted; zip() stops at the shortest of the three lists.
    return list(zip(times, stations, values))
def store_flowdata(data, db_path=None):
    """Store one parsed flow report in the ``flowrate`` table.

    Args:
        data: two-item sequence ``[measured_at, pairs]`` where ``pairs``
            yields ``(station_name, value)`` items.
        db_path: SQLite file to write to. Defaults to
            ``os.path.join(db_dir, sqlite_db)``.

    The table is created on first use. Rows are written with
    ``INSERT OR REPLACE`` so that storing the same report twice updates the
    existing rows instead of failing on the
    ``(measured_at, station_name)`` primary key.
    """
    if db_path is None:
        db_path = os.path.join(db_dir, sqlite_db)

    time = data[0]
    # Build the rows once; they are both printed and inserted.
    rows = [[time, d[0], d[1]] for d in data[1]]

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if any statement raises.
        with conn:
            conn.execute('CREATE TABLE IF NOT EXISTS flowrate (measured_at TEXT NOT NULL, station_name TEXT NOT NULL, value REAL NOT NULL, PRIMARY KEY (measured_at, station_name))')
            print(rows)
            conn.executemany('INSERT OR REPLACE INTO flowrate VALUES (?,?,?)', rows)
    finally:
        conn.close()
def store_temperature(data, db_path=None):
    """Store parsed temperature readings in the ``temperature`` table.

    Args:
        data: iterable of ``(measured_at, station_name, value)`` rows.
        db_path: SQLite file to write to. Defaults to
            ``os.path.join(db_dir, sqlite_db)``.

    The table is created on first use. Rows are written with
    ``INSERT OR REPLACE`` so that storing the same reading twice updates the
    existing row instead of failing on the
    ``(measured_at, station_name)`` primary key.
    """
    if db_path is None:
        db_path = os.path.join(db_dir, sqlite_db)

    # Materialise once so a one-shot iterator can be both printed and inserted.
    rows = list(data)

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # if any statement raises.
        with conn:
            conn.execute('CREATE TABLE IF NOT EXISTS temperature (measured_at TEXT NOT NULL, station_name TEXT NOT NULL, value REAL NOT NULL, PRIMARY KEY (measured_at, station_name))')
            print(rows)
            conn.executemany('INSERT OR REPLACE INTO temperature VALUES (?,?,?)', rows)
    finally:
        conn.close()
def process_attachments():
    """Parse and store every PDF found under ``<detach_dir>/attachments``.

    Each sub-directory is scanned for ``.pdf`` files. A file whose first text
    line is ``'River Thames Flows'`` is handled as a flow report; one whose
    second line is ``'Water Temperature'`` is handled as a temperature
    report. Every PDF is deleted after it has been looked at, recognised or
    not, and a sub-directory is removed once it is empty. Does nothing when
    the attachments directory does not exist.
    """
    root = os.path.join(detach_dir, 'attachments')
    if not os.path.isdir(root):
        # Nothing has been downloaded yet.
        return

    for d in os.listdir(root):
        dirPath = os.path.join(root, d)
        if not os.path.isdir(dirPath):
            continue

        for f in os.listdir(dirPath):
            if not f.endswith('.pdf'):
                continue
            filePath = os.path.join(dirPath, f)
            lines = get_pdf_lines(filePath)
            if lines and lines[0] == 'River Thames Flows':
                print('---------------------')
                store_flowdata(parse_flowdata(lines))
            elif len(lines) > 1 and lines[1] == 'Water Temperature':
                print('---------------------')
                store_temperature(parse_temperature(lines))
            # Remove the file now we are done with it
            os.remove(filePath)

        # Remove the parent directory if all attachments have been removed
        if not os.listdir(dirPath):
            os.rmdir(dirPath)
if __name__ == '__main__':
    # Run only when executed as a script, so importing this module does not
    # contact the mail server or touch the database.
    # Fetch new report e-mails first, then parse and store the waiting PDFs.
    download_attachments()
    process_attachments()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement