Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/local/bin/python
- import os
- import sys
- import time
- import random
- import datetime
- import re
- import urllib.request, urllib.error, urllib.parse
- import sqlite3
- import hashlib
- from bs4 import BeautifulSoup
def main():
    """Refresh the local ALNAV store from the listing page and print the feed.

    New listings found on the page are added to the SQLite file, then every
    stored listing is written to standard output as RSS XML.
    """
    listing_url = "https://www.mynavyhr.navy.mil/References/Messages/ALNAV-2021/"
    database = "alnavs.db"

    # Store anything on the page that the database does not hold yet.
    updatedb(database, get_current_items_list(listing_url))

    # The feed is always produced from the database contents.
    print_items(get_items(database))
- #######################################################################
- ### GET THE CURRENT SET OF ALNAVS ###
- #######################################################################
def get_current_items_list(url):
    """Fetch the ALNAV listing page and return its entries.

    Args:
        url: Address of the listing page. It is also the base against which
            each entry's link is resolved.

    Returns:
        A list of dicts with the keys "number", "title", "href", "desc" and
        "pubdate". "desc" and "pubdate" are empty strings here. The list is
        empty (and a warning is written to stderr) when the listing table
        cannot be found.
    """
    pagetext = get_response(url)
    page = BeautifulSoup(pagetext, 'html.parser')

    # Manually determined the listings are in the only table on the
    # page with no border. Eventually should write a function to
    # heuristically determine the listing for automagic RSS extraction.
    # Would probably look for a series of <li> or <tr> or <p> that have
    # <a> elements with similar URLs in the href attribute and no nesting
    # (in order to rule out menu systems).
    table = page.find("table", attrs={"border": "0"})
    if table is None:
        print("warning: no listing table found at {}".format(url),
              file=sys.stderr)
        return []

    # The markup may omit <tbody>; search the table itself in that case.
    container = table.tbody if table.tbody is not None else table

    items = []
    for tr in container.find_all("tr"):
        tds = tr.find_all("td")
        # Header or spacer rows do not have both a number and a title cell.
        if len(tds) < 2:
            continue
        a = tds[1].find("a")
        if a is None:
            continue
        href = a.get('href')
        if not href:
            continue
        items.append({
            "number": filternonascii(tds[0].text.strip()),
            "title": filternonascii(tds[1].text.strip()),
            # urljoin handles root-relative, relative and absolute links.
            "href": urllib.parse.urljoin(url, href),
            "desc": "",
            "pubdate": "",
        })
    return items
- #######################################################################
- ### GET AN HTTP RESPONSE FROM A URL REQUEST ###
- #######################################################################
def get_response(url, timeout=30):
    """Request *url* and return the raw response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The response body as bytes.

    Raises:
        urllib.error.URLError: If the request fails (HTTP errors are raised
            as its subclass urllib.error.HTTPError).
    """
    request = urllib.request.Request(url)
    request.add_header('User-agent', get_random_user_agent())
    # The with-block closes the connection even if reading fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read()
- #######################################################################
- ### REMOVES ALL NON-ASCII CHARACTERS FROM A STRING ###
- #######################################################################
def filternonascii(text):
    """Return *text* with every character above code point 127 removed."""
    kept = [char for char in text if ord(char) < 128]
    return "".join(kept)
- #######################################################################
- ### ADDS NEWLY LISTED ALNAVS TO THE DATABASE ###
- #######################################################################
def updatedb(dbfilename, newitems):
    """Add listings that are not yet stored to the database.

    Entries of *newitems* whose "number" is already in the database (or that
    repeat an earlier entry of the list) are dropped. For each remaining
    entry the full text is downloaded, made HTML-safe and searched for its
    publication date, and the entry is then inserted.

    Args:
        dbfilename: Path of the SQLite file. The file and the ``listings``
            table are created when missing.
        newitems: List of listing dicts with the keys "number", "title",
            "href", "desc" and "pubdate". The list is modified in place: on
            return it holds only the inserted entries, with "desc" and
            "pubdate" filled in.
    """
    # Create the schema on first use. IF NOT EXISTS also covers a file that
    # exists but never got its table.
    con = sqlite3.connect(dbfilename)
    try:
        con.execute("create table if not exists listings (number varchar(10), title varchar(255), href varchar(255), desc text, pubdate varchar(255));")
        con.commit()
        known = {row[0] for row in con.execute('SELECT number FROM listings')}
    finally:
        con.close()

    # Keep only unseen numbers. A fresh list is built because removing from
    # a list while iterating over it skips elements.
    fresh = []
    for newitem in newitems:
        if newitem['number'] in known:
            continue
        known.add(newitem['number'])
        fresh.append(newitem)
    newitems[:] = fresh

    # For each remaining entry to update, get the full description and
    # then pull the publication date from the text
    for newitem in newitems:
        newitem['desc'] = get_response(newitem['href'])
        newitem['desc'] = make_html_safe(newitem['desc'])
        newitem['pubdate'] = get_pubdate_from_alnav_text(newitem['desc'])

    # Insert the new values into the database.
    con = sqlite3.connect(dbfilename)
    try:
        con.executemany(
            'INSERT INTO listings (number, title, href, desc, pubdate) VALUES (?, ?, ?, ?, ?)',
            [
                (item['number'], item['title'], item['href'], item['desc'], item['pubdate'])
                for item in newitems
            ],
        )
        con.commit()
    finally:
        con.close()
- #######################################################################
- ### READS ALL STORED ALNAVS FROM THE DATABASE ###
- #######################################################################
def get_items(dbfilename):
    """Return every stored listing, highest number first.

    Args:
        dbfilename: Path of the SQLite file holding the ``listings`` table.

    Returns:
        A list of dicts, one per row, keyed by column name and ordered by
        "number" descending.

    Raises:
        sqlite3.OperationalError: If the ``listings`` table does not exist.
    """
    con = sqlite3.connect(dbfilename)
    try:
        # Row objects expose column names, so each row converts to a dict.
        con.row_factory = sqlite3.Row
        cur = con.cursor()
        cur.execute('SELECT * FROM listings ORDER BY number DESC')
        return [dict(row) for row in cur.fetchall()]
    finally:
        # Close the connection even when the query fails.
        con.close()
- #######################################################################
- ### SEARCHES AN ALNAV FOR A NAVY DTG AND CREATES A PUBDATE ###
- #######################################################################
def get_pubdate_from_alnav_text(text):
    """Find the date-time group in an ALNAV and format it for RSS.

    Args:
        text: Message text. It is converted with ``str()`` before searching.

    Returns:
        The first routine-precedence DTG as an RFC 822 date in UTC, e.g.
        ``"Fri, 30 Apr 2021 20:51:00 +0000"``. An empty string is returned
        when no DTG is present or it is not a valid date.
    """
    # Function-scope import: only this function needs it.
    from email.utils import format_datetime

    # ALNAV dates are in format `R 302051Z APR 21`
    # Or in other words "R " + "%d%H%MZ %b %y"
    # The word boundaries keep the pattern from matching inside a longer
    # word (e.g. "FOR 30...") or a four-digit year.
    match = re.search(r"\bR [0-9]{6}Z [A-Z]{3} [0-9]{2}\b", str(text))
    if match is None:
        return ""

    try:
        # The trailing Z means UTC; append an explicit offset so the parsed
        # datetime is timezone-aware.
        stamp = datetime.datetime.strptime(
            match.group(0) + " +0000", "R %d%H%MZ %b %y %z")
    except ValueError:
        # Looked like a DTG but is not a real date (e.g. day 99).
        return ""

    # format_datetime writes the zone as "+0000", which RFC 822 allows;
    # strftime's %Z would write "UTC", which it does not.
    return format_datetime(stamp)
- #######################################################################
- ### MAKES PLAIN TEXT SUITABLE FOR DISPLAY AS HTML ###
- #######################################################################
def make_html_safe(text):
    """Convert plain message text into markup that displays as written.

    Args:
        text: The message as bytes (decoded as Latin-1) or as str.

    Returns:
        A str with ``&``, ``<`` and ``>`` replaced by character entities,
        every line break followed by ``<br />``, and surrounding whitespace
        stripped.
    """
    # Function-scope import: only this function needs it.
    from html import escape

    if isinstance(text, (bytes, bytearray)):
        # Latin-1 maps every byte to a character, so decoding cannot fail.
        # (Same codec the '8859' alias resolves to.)
        text = text.decode('latin-1')

    # escape() handles '&' first, so the entities it adds are not re-escaped.
    safe = escape(text, quote=False)

    # NOTE(review): the previous code also replaced a space with a space,
    # which had no effect. Possibly it was meant to insert &nbsp; to keep
    # column alignment -- confirm before adding it back.

    # Accept both CRLF and bare LF line endings.
    safe = safe.replace("\r\n", "\n").replace("\n", "<br />\n")
    return safe.strip()
- #######################################################################
- ### GENERATES A RANDOM USER AGENT TO USE IN OUR REQUEST ###
- #######################################################################
def get_random_user_agent():
    """Return one browser User-Agent string picked at random.

    Every entry of the list can be returned; the choice is made with
    ``random.choice`` so the range always matches the list length.
    """
    user_agents = (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; MDDCJS)',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    )
    # A hard-coded index range goes stale when the list changes (the last
    # entry used to be unreachable); choice() always covers the whole list.
    return random.choice(user_agents)
- #######################################################################
- ### PRINT OUT A SET OF ALNAVS AS RSS XML ###
- #######################################################################
def print_items(items):
    """Write the listings to standard output as an RSS 2.0 document.

    The output starts with a ``Content-type`` header and a blank line, so it
    can be returned directly as a CGI response.

    Args:
        items: Iterable of listing dicts with the keys "number", "title",
            "href", "desc" and "pubdate". "desc" is emitted as-is inside a
            CDATA section; the other values are XML-escaped. An empty
            "pubdate" produces no <pubDate> element.
    """
    # Function-scope import: only this function needs it.
    from html import escape

    print("Content-type: text/xml")
    print()
    print("<?xml version=\"1.0\"?>")
    print("<rss version=\"2.0\">")
    print(" <channel>")
    print(" <title>ALNAVS 2021</title>")
    print(" <link>https://www.mynavyhr.navy.mil/References/Messages/ALNAV-2021/</link>")
    print(" <description>Custom RSS XML for All-Navy (ALNAV) Messages for CY 2021</description>")
    for item in items:
        # A bare '&' or '<' in element text makes the document malformed.
        title = escape("{}: {}".format(item['number'], item['title']), quote=False)
        link = escape(str(item['href']), quote=False)
        # "]]>" would end the CDATA section early, so split it across two
        # adjacent sections.
        desc = str(item['desc']).replace("]]>", "]]]]><![CDATA[>")
        print( " <item>" )
        print( " <title>{}</title>".format(title) )
        print( " <link>{}</link>".format(link) )
        print( " <description><![CDATA[{}]]></description>".format(desc) )
        # pubDate is optional in RSS; leave it out when no date is known.
        if item['pubdate']:
            print( " <pubDate>{}</pubDate>".format(escape(str(item['pubdate']), quote=False)) )
        print( " </item>" )
    print(" </channel>")
    print("</rss>")
- #######################################################################
- ### MAIN FUNCTION LOOPBACK ###
- #######################################################################
# Run the update and print the feed only when executed as a script, so
# importing this module has no side effects.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement