Advertisement
Guest User

Untitled

a guest
May 19th, 2021
43
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.89 KB | None | 0 0
  1. #!/usr/local/bin/python
  2.  
  3. import os
  4. import sys
  5. import time
  6. import random
  7. import datetime
  8. import re
  9. import urllib.request, urllib.error, urllib.parse
  10. import sqlite3
  11. import hashlib
  12. from bs4 import BeautifulSoup
  13.  
  14.  
  15.  
  16.  
  17.  
  18.  
  19.  
  20.  
  21.  
  22.  
  23. def main():
  24.     url = "https://www.mynavyhr.navy.mil/References/Messages/ALNAV-2021/"
  25.     db  = "alnavs.db"
  26.     newitems = get_current_items_list(url)
  27.     updatedb(db, newitems)
  28.     allitems = get_items(db)
  29.     print_items(allitems)
  30.  
  31.  
  32.  
  33.  
  34.  
  35.  
  36.  
  37.  
  38.  
  39.  
  40. #######################################################################
  41. ###                  GET THE CURRENT SET OF ALNAVS                  ###
  42. #######################################################################
  43. def get_current_items_list(url):
  44.     # Get the ALNAVS page and return the ALNAVS as a list of dictionaries.
  45.     pagetext = get_response(url)
  46.     page = BeautifulSoup(pagetext, 'html.parser')
  47.  
  48.     # Manually determined the listings are in the only table on the
  49.     # page with no border. Eventually should write a function to
  50.     # heuristically determine the listing for automagic RSS extraction.
  51.     # Would probably look for a series of <li> or <tr> or <p> that have
  52.     # <a> elements with similar URLs in the href attribute and no nesting
  53.     # (in order to rule out menu systems).
  54.     trs = page.find("table", attrs={"border": "0"}).tbody.find_all("tr")
  55.  
  56.     items = []
  57.     for tr in trs:
  58.         tds = tr.find_all("td")
  59.         a = tds[1].find("a")
  60.         if a is not None:
  61.             thisitem = {
  62.                 "number": filternonascii(tds[0].text.strip()),
  63.                 "title": filternonascii(tds[1].text.strip()),
  64.                 "href": "https://www.mynavyhr.navy.mil" + a.get('href'),
  65.                 "desc": "",
  66.                 "pubdate": ""
  67.             }
  68.             items.append(thisitem)
  69.  
  70.     return items
  71.  
  72.  
  73.  
  74.  
  75.  
  76.  
  77.  
  78.  
  79.  
  80.  
  81. #######################################################################
  82. ###             GET AN HTTP RESPONSE FROM A URL REQUEST             ###
  83. #######################################################################
  84. def get_response(url):
  85.     userAgent = get_random_user_agent()
  86.     request = urllib.request.Request(url)
  87.     request.add_header('User-agent', userAgent)
  88.     return urllib.request.urlopen(request).read()
  89.  
  90.  
  91.  
  92.  
  93.  
  94.  
  95.  
  96.  
  97.  
  98.  
  99. #######################################################################
  100. ###            STRIPS NON-ASCII CHARACTERS FROM A STRING            ###
  101. #######################################################################
  102. def filternonascii(text):
  103.     return "".join(filter(lambda x: ord(x)<128, text))
  104.  
  105.  
  106.  
  107.  
  108.  
  109.  
  110.  
  111.  
  112.  
  113. #######################################################################
  114. ###         ADDS NEWLY LISTED ALNAVS TO THE SQLITE DATABASE         ###
  115. #######################################################################
  116. def updatedb(dbfilename, newitems):
  117.     # If the database doesn't exist, create it with the right schema
  118.     if (not os.path.isfile(dbfilename)):
  119.         open(dbfilename, 'w+').close()
  120.         con = sqlite3.connect(dbfilename)
  121.         cur = con.cursor()
  122.         cur.execute("create table listings (number varchar(10), title varchar(255), href varchar(255), desc text, pubdate varchar(255));")
  123.         con.close()
  124.  
  125.     # Get all the existing ALNAV entries from the database
  126.     storeditems = get_items(dbfilename)
  127.  
  128.     # For each entry already in the database, remove it from the list to update
  129.     for storeditem in storeditems:
  130.         for newitem in newitems:
  131.             if storeditem['number'] == newitem['number']:
  132.                 newitems.remove(newitem)
  133.  
  134.     # For each remaining entry to update, get the full description and
  135.     # then pull the publication date from the text
  136.     for newitem in newitems:
  137.         newitem['desc'] = get_response(newitem['href'])
  138.         newitem['desc'] = make_html_safe(newitem['desc'])
  139.         newitem['pubdate'] = get_pubdate_from_alnav_text(newitem['desc'])
  140.  
  141.     con = sqlite3.connect(dbfilename)
  142.     cur = con.cursor()
  143.  
  144.     # Insert the new values into the database.
  145.     for newitem in newitems:
  146.         cur.execute('INSERT INTO listings (number, title, href, desc, pubdate) VALUES (?, ?, ?, ?, ?)', (newitem['number'], newitem['title'], newitem['href'], newitem['desc'], newitem['pubdate']))
  147.  
  148.     con.commit()
  149.     con.close()
  150.  
  151.  
  152.  
  153.  
  154.  
  155.  
  156.  
  157.  
  158.  
  159.  
  160. #######################################################################
  161. ###         READS ALL STORED ALNAVS FROM THE SQLITE DATABASE        ###
  162. #######################################################################
  163. def get_items(dbfilename):
  164.     con = sqlite3.connect(dbfilename)
  165.     con.row_factory = sqlite3.Row
  166.     cur = con.cursor()
  167.     cur.execute('SELECT * FROM listings ORDER BY number DESC')
  168.     storeditems = [dict(row) for row in cur.fetchall()]
  169.     con.close()
  170.     return storeditems
  171.  
  172.  
  173.  
  174.  
  175.  
  176.  
  177.  
  178.  
  179. #######################################################################
  180. ###      SEARCHES AN ALNAV FOR A NAVY DTG AND CREATES A PUBDATE     ###
  181. #######################################################################
  182. def get_pubdate_from_alnav_text(text):
  183.     # ALNAV dates are in format `R 302051Z APR 21`
  184.     # Or in other words "R " + "%d%H%MZ %b %y"
  185.     dtgregex = r"R [0-9][0-9][0-9][0-9][0-9][0-9]Z [A-Z][A-Z][A-Z] [0-9][0-9]"
  186.     pubdate = re.search(dtgregex, str(text)).group(0) + " +0000"
  187.     pubdate = datetime.datetime.strptime(str(pubdate), "R %d%H%MZ %b %y %z")
  188.     pubdate = pubdate.strftime("%a, %d %b %Y %H:%M:%S %Z")
  189.     return pubdate
  190.  
  191.  
  192.  
  193.  
  194.  
  195.  
  196.  
  197.  
  198.  
  199.  
  200. #######################################################################
  201. ###          MAKES PLAIN TEXT SUITABLE FOR DISPLAY AS HTML          ###
  202. #######################################################################
  203. def make_html_safe(text):
  204.     new = text
  205.     new = new.decode('8859')
  206.     new = new.replace("&", "&amp;")
  207.     new = new.replace("<", "&lt;")
  208.     new = new.replace(">", "&gt;")
  209.     new = new.replace("  ", " &nbsp;")
  210.     new = new.replace("\r\n", "<br />\n")
  211.     new = new.strip()
  212.     return new
  213.  
  214.  
  215.  
  216.  
  217.  
  218.  
  219.  
  220.  
  221.  
  222.  
  223. #######################################################################
  224. ###       GENERATES A RANDOM USER AGENT TO USE IN OUR REQUEST       ###
  225. #######################################################################
  226. def get_random_user_agent():
  227.     randomint = random.randint(0,18)
  228.     user_agents = [
  229.         'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
  230.         'Opera/9.25 (Windows NT 5.1; U; en)',
  231.         'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  232.         'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
  233.         'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.142 Safari/535.19',
  234.         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
  235.         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:8.0.1) Gecko/20100101 Firefox/8.0.1',
  236.         'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.151 Safari/535.19',
  237.         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
  238.         'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
  239.         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
  240.         'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
  241.         'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
  242.         'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
  243.         'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0;  Trident/5.0)',
  244.         'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; MDDCJS)',
  245.         'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
  246.         'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4',
  247.         'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
  248.         'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
  249.     ]
  250.  
  251.     return user_agents[randomint]
  252.  
  253.  
  254.  
  255.  
  256.  
  257.  
  258.  
  259.  
  260.  
  261.  
  262. #######################################################################
  263. ###               PRINT OUT A SET OF ALNAVS AS RSS XML              ###
  264. #######################################################################
  265. def print_items(items):
  266.     print("Content-type: text/xml")
  267.     print()
  268.     print("<?xml version=\"1.0\"?>")
  269.     print("<rss version=\"2.0\">")
  270.     print("  <channel>")
  271.     print("    <title>ALNAVS 2021</title>")
  272.     print("    <link>https://www.mynavyhr.navy.mil/References/Messages/ALNAV-2021/</link>")
  273.     print("    <description>Custom RSS XML for All-Navy (ALNAV) Messages for CY 2021</description>")
  274.  
  275.     for item in items:
  276.         print( "    <item>"                                                           )
  277.         print( "      <title>{}: {}</title>".format(item['number'], item['title'])    )
  278.         print( "      <link>{}</link>".format(item['href'])                           )
  279.         print( "      <description><![CDATA[{}]]></description>".format(item['desc']) )
  280.         print( "      <pubDate>{}</pubDate>".format(item['pubdate'])                  )
  281.         print( "    </item>"                                                          )
  282.  
  283.     print("  </channel>")
  284.     print("</rss>")
  285.  
  286.  
  287.  
  288.  
  289.  
  290.  
  291.  
  292.  
  293.  
  294.  
  295. #######################################################################
  296. ###                      MAIN FUNCTION LOOPBACK                     ###
  297. #######################################################################
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
  300.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement