# Untitled

#!/usr/local/bin/python

import os
import sys
import time
import random
import datetime
import re
import urllib.request, urllib.error, urllib.parse
import sqlite3
import hashlib
from bs4 import BeautifulSoup


def main():
    """Refresh the ALNAV database from the listing page and print the feed.

    Scrapes the current listing, hands the scraped entries to updatedb()
    for storage, then reads every stored entry back and prints the lot
    through print_items().
    """
    listing_url = "https://www.mynavyhr.navy.mil/References/Messages/ALNAV-2021/"
    database = "alnavs.db"

    # Scrape the live listing and let updatedb() store what is new.
    updatedb(database, get_current_items_list(listing_url))

    # Emit everything that is now in the database.
    print_items(get_items(database))


#######################################################################
###                  GET THE CURRENT SET OF ALNAVS                  ###
#######################################################################
def get_current_items_list(url):
    """Scrape the ALNAV listing page and return its entries.

    Args:
        url: Address of the listing page. It is also the base against
            which each entry's link is resolved.

    Returns:
        A list of dicts with the keys "number", "title", "href", "desc"
        and "pubdate", one per table row that has at least two cells and
        a link in the second cell. "desc" and "pubdate" are left as
        empty strings for the caller to fill in.

    Raises:
        AttributeError: If the page contains no table with border="0".
    """
    pagetext = get_response(url)
    page = BeautifulSoup(pagetext, 'html.parser')

    # Manually determined: the listing lives in the only table on the page
    # with border="0". Eventually this should be replaced by a heuristic
    # that looks for a run of <li>/<tr>/<p> elements whose <a> hrefs are
    # similar and un-nested (to rule out menu systems).
    table = page.find("table", attrs={"border": "0"})

    # Fall back to the table itself when the markup has no explicit <tbody>.
    body = table.tbody
    if body is None:
        body = table

    items = []
    for tr in body.find_all("tr"):
        tds = tr.find_all("td")
        if len(tds) < 2:
            # Rows without a number cell and a title cell (e.g. <th> header
            # rows) are not listings.
            continue

        a = tds[1].find("a")
        href = a.get('href') if a is not None else None
        if not href:
            # No link (or a link without a target) means nothing to fetch.
            continue

        items.append({
            "number": filternonascii(tds[0].text.strip()),
            "title": filternonascii(tds[1].text.strip()),
            # urljoin copes with site-relative, page-relative and already
            # absolute hrefs alike.
            "href": urllib.parse.urljoin(url, href),
            "desc": "",
            "pubdate": "",
        })

    return items


#######################################################################
###             GET AN HTTP RESPONSE FROM A URL REQUEST             ###
#######################################################################
def get_response(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a User-agent header picked by
    get_random_user_agent().

    Args:
        url: Address to request.

    Returns:
        The response body as returned by ``read()`` (undecoded).
    """
    request = urllib.request.Request(url)
    request.add_header('User-agent', get_random_user_agent())

    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


#######################################################################
###            STRIPS NON-ASCII CHARACTERS FROM A STRING            ###
#######################################################################
def filternonascii(text):
    """Return *text* with every character whose code point is >= 128 removed."""
    kept = (char for char in text if ord(char) < 128)
    return "".join(kept)


#######################################################################
###             UPDATE THE DATABASE WITH ANY NEW ALNAVS             ###
#######################################################################
def updatedb(dbfilename, newitems):
    """Store the ALNAVs from *newitems* that are not yet in the database.

    Args:
        dbfilename: Path of the SQLite database file. The file and its
            ``listings`` table are created when missing.
        newitems: List of item dicts with the keys "number", "title",
            "href", "desc" and "pubdate". The list is modified in place:
            entries whose number is already stored are removed, and each
            remaining entry gets its "desc" and "pubdate" filled in from
            the document behind its "href".
    """
    # sqlite3.connect() creates the file when needed, and IF NOT EXISTS
    # makes the schema setup safe to run on every call -- including against
    # a file that exists but never got its table.
    con = sqlite3.connect(dbfilename)
    try:
        con.execute("CREATE TABLE IF NOT EXISTS listings (number varchar(10), title varchar(255), href varchar(255), desc text, pubdate varchar(255))")
        con.commit()
        # Only the numbers are needed to decide what is new.
        storednumbers = {row[0] for row in con.execute("SELECT number FROM listings")}
    finally:
        con.close()

    # Rebuild the list instead of calling remove() while iterating over it,
    # which skips the element that follows each removal. Slice assignment
    # keeps the caller's list object updated.
    newitems[:] = [item for item in newitems if item['number'] not in storednumbers]

    if not newitems:
        return

    # For each remaining entry, fetch the full text and derive the
    # publication date from it. No connection is held open while fetching.
    for newitem in newitems:
        newitem['desc'] = make_html_safe(get_response(newitem['href']))
        newitem['pubdate'] = get_pubdate_from_alnav_text(newitem['desc'])

    con = sqlite3.connect(dbfilename)
    try:
        con.executemany(
            'INSERT INTO listings (number, title, href, desc, pubdate) VALUES (?, ?, ?, ?, ?)',
            [
                (item['number'], item['title'], item['href'], item['desc'], item['pubdate'])
                for item in newitems
            ],
        )
        con.commit()
    finally:
        con.close()


#######################################################################
###            READ EVERY STORED ALNAV FROM THE DATABASE            ###
#######################################################################
def get_items(dbfilename):
    """Return every row of the ``listings`` table as a list of dicts.

    Args:
        dbfilename: Path of the SQLite database file.

    Returns:
        One dict per row, keyed by column name, ordered by the ``number``
        column in descending order.

    Raises:
        sqlite3.OperationalError: If the query fails, e.g. because the
            ``listings`` table does not exist.
    """
    con = sqlite3.connect(dbfilename)
    try:
        # sqlite3.Row exposes column names, so each row converts to a dict.
        con.row_factory = sqlite3.Row
        cur = con.cursor()
        cur.execute('SELECT * FROM listings ORDER BY number DESC')
        return [dict(row) for row in cur.fetchall()]
    finally:
        # Close the connection even when the query raises.
        con.close()


#######################################################################
###      SEARCHES AN ALNAV FOR A NAVY DTG AND CREATES A PUBDATE     ###
#######################################################################
def get_pubdate_from_alnav_text(text):
    """Derive an RSS pubDate string from the date-time group in an ALNAV.

    ALNAV dates are in the format ``R 302051Z APR 21``, in other words
    "R " followed by "%d%H%MZ %b %y", with the Z meaning UTC.

    Args:
        text: The ALNAV text to search (converted with ``str()`` first).

    Returns:
        The first parseable date-time group formatted like
        ``Fri, 30 Apr 2021 20:51:00 +0000``, or an empty string when the
        text contains no parseable date-time group.
    """
    dtgregex = r"R [0-9][0-9][0-9][0-9][0-9][0-9]Z [A-Z][A-Z][A-Z] [0-9][0-9]"

    # The pattern only checks the shape, so a hit may still not be a real
    # date (e.g. a three-letter word that is not a month). Try each hit in
    # turn and use the first one that parses.
    for match in re.finditer(dtgregex, str(text)):
        try:
            pubdate = datetime.datetime.strptime(
                match.group(0) + " +0000", "R %d%H%MZ %b %y %z"
            )
        except ValueError:
            continue
        # Emit the numeric offset (%z -> "+0000") rather than the zone name
        # (%Z -> "UTC"), which is not one of the RFC 822 zone tokens.
        return pubdate.strftime("%a, %d %b %Y %H:%M:%S %z")

    return ""


#######################################################################
###          MAKES PLAIN TEXT SUITABLE FOR DISPLAY AS HTML          ###
#######################################################################
def make_html_safe(text):
    """Decode raw bytes and escape them for display as HTML.

    The bytes are decoded with the '8859' codec, then the markup
    characters &, < and > are replaced by entities, each pair of spaces
    becomes a space plus a non-breaking space, and each CRLF line ending
    becomes a <br /> tag followed by a newline. Leading and trailing
    whitespace is stripped from the result.
    """
    # Order matters: "&" must be handled first so the entities introduced
    # by the later substitutions are not escaped again.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("  ", " &nbsp;"),
        ("\r\n", "<br />\n"),
    )

    markup = text.decode('8859')
    for raw, replacement in substitutions:
        markup = markup.replace(raw, replacement)

    return markup.strip()


#######################################################################
###       GENERATES A RANDOM USER AGENT TO USE IN OUR REQUEST       ###
#######################################################################
def get_random_user_agent():
    """Return one User-Agent string picked at random from a fixed list.

    random.choice() keeps the selection in step with the list itself, so
    every entry can be returned and adding or removing entries needs no
    index bookkeeping.
    """
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0;  Trident/5.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; MDDCJS)',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    ]

    return random.choice(user_agents)


#######################################################################
###               PRINT OUT A SET OF ALNAVS AS RSS XML              ###
#######################################################################
def print_items(items):
    """Print *items* to standard output as an RSS 2.0 document.

    The output starts with a ``Content-type`` header line and a blank
    line, followed by the XML.

    Args:
        items: Iterable of dicts with the keys "number", "title", "href",
            "desc" and "pubdate". Number, title and href are XML-escaped.
            "desc" is emitted inside a CDATA section, so it may contain
            HTML markup. A <pubDate> element is written only when
            "pubdate" is non-empty.
    """
    # Imported here because the file's import block does not provide it.
    from xml.sax.saxutils import escape

    print("Content-type: text/xml")
    print()
    print("<?xml version=\"1.0\"?>")
    print("<rss version=\"2.0\">")
    print("  <channel>")
    print("    <title>ALNAVS 2021</title>")
    print("    <link>https://www.mynavyhr.navy.mil/References/Messages/ALNAV-2021/</link>")
    print("    <description>Custom RSS XML for All-Navy (ALNAV) Messages for CY 2021</description>")

    for item in items:
        # Titles and URLs are plain text; a bare "&" or "<" in them would
        # make the whole document ill-formed.
        title = escape("{}: {}".format(item['number'], item['title']))
        link = escape(str(item['href']))
        # A literal "]]>" would end the CDATA section early; split it
        # across two adjacent sections so the text survives unchanged.
        desc = str(item['desc']).replace("]]>", "]]]]><![CDATA[>")

        print("    <item>")
        print("      <title>{}</title>".format(title))
        print("      <link>{}</link>".format(link))
        print("      <description><![CDATA[{}]]></description>".format(desc))
        if item['pubdate']:
            print("      <pubDate>{}</pubDate>".format(escape(str(item['pubdate']))))
        print("    </item>")

    print("  </channel>")
    print("</rss>")


#######################################################################
###                      MAIN FUNCTION LOOPBACK                     ###
#######################################################################
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()