Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- import re
- import webapp2
- from bs4 import BeautifulSoup
- from datetime import datetime
- import httplib
- import logging
- from google.appengine.ext import ndb
- from google.appengine.api import mail
class VBStatus(ndb.Model):
    """Datastore record of whether one month's Visa Bulletin has been uploaded.

    One entity is looked up per (year, month) pair by ``VBHandler.get`` and
    its ``status`` flag is written by ``VBHandler.get_vb``.
    """

    # Calendar year of the bulletin (not the fiscal year used in its URL).
    year = ndb.IntegerProperty()

    # Calendar month of the bulletin, 1-12.
    month = ndb.IntegerProperty()

    # True once the bulletin for (year, month) was uploaded successfully.
    status = ndb.BooleanProperty()
class VBHandler(webapp2.RequestHandler):
    """Fetch a monthly Visa Bulletin page, parse its tables and upload them.

    ``get`` is the HTTP entry point.  It decides which (year, month) to
    process, consults the ``VBStatus`` record for that month and delegates
    to ``get_vb``, which downloads the bulletin from travel.state.gov,
    parses the family and employment tables, posts the result to a Parse
    cloud function, stores the outcome and e-mails a report.
    """

    # Month number -> lowercase English month name as used in bulletin URLs.
    _MONTH_NAMES = {
        1: "january",
        2: "february",
        3: "march",
        4: "april",
        5: "may",
        6: "june",
        7: "july",
        8: "august",
        9: "september",
        10: "october",
        11: "november",
        12: "december",
    }

    # Path prefix shared by every candidate bulletin URL.
    _BULLETIN_PATH = "/content/visas/english/law-and-policy/bulletin/"

    # The bulletin has been published under both file-name patterns, so
    # both are tried in this order.
    _BULLETIN_FILE_PATTERNS = (
        "{fiscal_year}/visa-bulletin-for-{month}-{year}.html",
        "{fiscal_year}/visa-bulletin-{month}-{year}.html",
    )

    # Ordered (substring, code) pairs for employment-based class labels.
    # Order matters: the first substring found in the label wins.
    _EMPLOYMENT_CLASSES = (
        ("1st", "eb1"),
        ("2nd", "eb2"),
        ("3rd", "eb3"),
        ("4th", "eb4"),
        ("5th", "eb5"),
        ("other", "eb_ow"),
        ("religious", "eb_crw"),
    )

    # Ordered (substring, code) pairs for country column headers.
    _COUNTRIES = (
        ("chargeability", "row"),
        ("china", "china"),
        ("india", "india"),
        ("mexico", "mexico"),
        ("philippines", "philippines"),
    )

    @staticmethod
    def get_month(month):
        """Return the lowercase English name of month number ``month``.

        Raises:
            KeyError: if ``month`` is not an integer in 1..12.
        """
        return VBHandler._MONTH_NAMES[month]

    @staticmethod
    def get_urls(year, month):
        """Return the candidate URL paths of the bulletin for ``year``/``month``.

        Bulletins for October and later are filed under the following
        year's directory (``year + 1``); earlier months use ``year``.

        Returns:
            A list of two site-relative paths, in the order they should be
            tried.

        Raises:
            KeyError: if ``month`` is not in 1..12 (from ``get_month``).
        """
        fiscal_year = year + 1 if month >= 10 else year
        month_name = VBHandler.get_month(month)
        return [
            VBHandler._BULLETIN_PATH + pattern.format(
                fiscal_year=fiscal_year,
                year=year,
                month=month_name,
            )
            for pattern in VBHandler._BULLETIN_FILE_PATTERNS
        ]

    @staticmethod
    def _normalize_cutoff(cell_text):
        """Normalize one cut-off cell of a bulletin table.

        Returns:
            ``''`` for ``"C"``, ``'YYYY-MM-DD'`` for a ``%d%b%y`` date such
            as ``01JAN14``, and the unmodified cell text for anything else.
        """
        if cell_text == "C":
            return ''
        try:
            parsed = datetime.strptime(cell_text, "%d%b%y")
        except ValueError:
            # Cells are not always dates or "C" (bulletins also use "U").
            # Pass the raw text through instead of aborting the whole parse.
            # NOTE(review): confirm the Parse cloud function accepts
            # non-date strings here.
            logging.warning("Unparsed visa bulletin cell: %r", cell_text)
            return cell_text
        return parsed.strftime('%Y-%m-%d')

    @staticmethod
    def parse(soup, table_name):
        """Extract one cut-off table from a parsed bulletin page.

        The table is located as ``div.<table_name> > table > tbody``.  The
        first cell of the first row names the visa type, the remaining
        cells of the first row are country headers, and every later row is
        a visa class label followed by one cut-off cell per country.

        Args:
            soup: BeautifulSoup document of the bulletin page.
            table_name: CSS class of the ``div`` wrapping the table.

        Returns:
            ``{'type': <visa type>, 'classes': {<class>: {<country>: <cutoff>}}}``
            where each cutoff is produced by ``_normalize_cutoff``.

        Raises:
            ValueError: if the wrapping div, its table or its tbody is
                missing from ``soup``.
        """
        container = soup.find("div", {"class": table_name})
        table = container.find("table") if container is not None else None
        tbody = table.find("tbody") if table is not None else None
        if tbody is None:
            raise ValueError(
                "visa bulletin table not found: {name}".format(name=table_name)
            )

        visa_type = None
        countries = []
        classes = []
        result = {}
        for row, tr in enumerate(tbody.findAll("tr")):
            for column, td in enumerate(tr.findAll("td")):
                content = ''.join(td.findAll(text=True)).strip()
                if row == 0 and column == 0:
                    visa_type = VBHandler.normalize_visa_type(content)
                elif column == 0:
                    visa_class = VBHandler.normalize_visa_class(
                        visa_type, content)
                    classes.append(visa_class)
                    result[visa_class] = {}
                elif row == 0:
                    countries.append(VBHandler.normalize_country(content))
                else:
                    # Row 0 and column 0 are headers, hence the -1 offsets.
                    cutoffs = result[classes[row - 1]]
                    cutoffs[countries[column - 1]] = \
                        VBHandler._normalize_cutoff(content)
        return {'type': visa_type, 'classes': result}

    @staticmethod
    def normalize_visa_type(visa_type):
        """Map a table's header text to ``"family"`` or ``"employment"``.

        Matching is a case-insensitive substring test; ``None`` is returned
        when neither word occurs.
        """
        lowered = visa_type.lower()
        for keyword in ("family", "employment"):
            if keyword in lowered:
                return keyword
        return None

    @staticmethod
    def normalize_visa_class(visa_type, visa_class):
        """Map a row label to a visa class code.

        For ``visa_type == "employment"`` the label is matched (case
        insensitively, first hit wins) against ``_EMPLOYMENT_CLASSES``.
        For ``visa_type == "family"`` the lowercased label is returned
        unchanged.  Any other combination yields ``None``.
        """
        lowered = visa_class.lower()
        if visa_type == "employment":
            for needle, code in VBHandler._EMPLOYMENT_CLASSES:
                if needle in lowered:
                    return code
            return None
        if visa_type == 'family':
            return lowered
        return None

    @staticmethod
    def normalize_country(country):
        """Map a column header to a short country key.

        Headers containing "chargeability" become ``"row"``; headers
        naming China, India, Mexico or the Philippines become that
        country's lowercase name; anything else is returned lowercased.
        """
        lowered = country.lower()
        for needle, code in VBHandler._COUNTRIES:
            if needle in lowered:
                return code
        return lowered

    def upload_to_parse(self, year, month, vb_data, full_url, force):
        """POST the parsed bulletin to the ``updateVisaBulletin`` function.

        Args:
            year: Bulletin year.
            month: Bulletin month number.
            vb_data: Parsed tables keyed by visa type.
            full_url: Absolute URL the bulletin was fetched from.
            force: Forwarded to the cloud function unchanged.

        Returns:
            The decoded JSON response, or ``{"result": {"error": "fail"}}``
            if building the request, sending it or decoding the reply
            fails for any reason.
        """
        param_data = {
            "year": year,
            "month": month,
            "full_url": full_url,
            "data": vb_data,
            "force": force,
        }
        headers = {
            "X-Parse-Application-Id": "YOUR_APP_ID",
            "X-Parse-REST-API-Key": "YOUR_API_KEY",
            "Content-Type": "application/json"
        }
        connection = None
        try:
            connection = httplib.HTTPSConnection('api.parse.com', 443)
            connection.connect()
            connection.request(
                'POST',
                '/1/functions/updateVisaBulletin',
                json.dumps(param_data),
                headers
            )
            return json.loads(connection.getresponse().read())
        except Exception:
            # Best effort: report the failure to the caller as data, but
            # leave a traceback in the log instead of hiding the cause.
            logging.exception(
                "Upload to Parse failed for %s-%s", year, month)
            return {
                "result": {
                    "error": "fail"
                }
            }
        finally:
            if connection is not None:
                connection.close()

    def _fetch_bulletin(self, year, month):
        """Download the bulletin page for ``year``/``month``.

        Each candidate path from ``get_urls`` is requested in turn; the
        first one answering with HTTP 200 wins.

        Returns:
            ``(html, full_url)`` on success, ``(None, None)`` if no
            candidate could be fetched.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
        }
        for url in VBHandler.get_urls(year, month):
            conn = None
            try:
                conn = httplib.HTTPConnection("travel.state.gov")
                conn.request("GET", url, None, headers)
                resp = conn.getresponse()
                if resp.status == 200:
                    # Read before the connection is closed in ``finally``.
                    return resp.read(), "http://travel.state.gov" + url
                logging.warning("VB not found {year}-{month} @ {url}".format(year=year, month=month, url=url))
            except Exception:
                # Network trouble on one candidate must not stop us from
                # trying the next one, but it should be visible in logs.
                logging.exception(
                    "VB fetch failed %s-%s @ %s", year, month, url)
            finally:
                if conn is not None:
                    conn.close()
        return None, None

    def get_vb(self, year, month, vb_status, force):
        """Fetch, parse, upload and report the bulletin for one month.

        Writes a JSON body to the response: ``{"result": "not found"}``
        when no bulletin page could be downloaded, otherwise the same
        report that is e-mailed (year, month, URL, parsed data and upload
        result).  ``vb_status.status`` is set to whether the upload
        returned an ``id`` and the entity is saved.

        Args:
            year: Bulletin year.
            month: Bulletin month number (1-12).
            vb_status: ``VBStatus`` entity for this month; mutated and put.
            force: Forwarded to ``upload_to_parse``.

        Raises:
            ValueError: if an expected table is missing from the page
                (from ``parse``).
        """
        self.response.content_type = 'application/json'

        data, full_url = self._fetch_bulletin(year, month)
        if data is None:
            self.response.write(json.dumps({
                "result": "not found",
            }))
            return

        soup = BeautifulSoup(data)
        family_based = VBHandler.parse(soup, "visabulletinmaintable")
        employment_based = VBHandler.parse(soup, "visabulletinemploymenttable")
        vb_data = {
            family_based['type']: family_based['classes'],
            employment_based['type']: employment_based['classes'],
        }

        upload_result = self.upload_to_parse(year, month, vb_data, full_url, force)

        # A reply without a "result" object (e.g. an error payload) counts
        # as a failed upload rather than raising KeyError/TypeError.
        outcome = None
        if isinstance(upload_result, dict):
            outcome = upload_result.get('result')
        uploaded = isinstance(outcome, dict) and 'id' in outcome

        if uploaded:
            subject = "VB Uploaded {year}-{month}: {result}".format(year=year, month=month, result=json.dumps(upload_result))
            logging.info(subject)
        else:
            subject = "Failed to upload {year}-{month}: {result}".format(year=year, month=month, result=json.dumps(upload_result))
            logging.error(subject)
        vb_status.status = uploaded
        vb_status.put()

        sender_address = "YOUR_SENDER_EMAIL"
        body = json.dumps({
            "year": year,
            "month": month,
            "full_url": full_url,
            "data": vb_data,
            "upload_result": upload_result,
        })
        mail.send_mail(sender_address, "YOUR_EMAIL", subject, body)
        self.response.write(body)

    def get(self):
        """Handle GET: process the requested month, or next month by default.

        Query parameters:
            year, month: Month to process.  If either is missing or empty,
                the month after the current one is used.
            force: Any non-empty value (including ``"0"`` or ``"false"``)
                re-processes a month already marked as uploaded.

        Responds with HTTP 400 when ``year``/``month`` are not integers or
        ``month`` is outside 1..12.
        """
        force = bool(self.request.get('force'))
        year_arg = self.request.get('year')
        month_arg = self.request.get('month')

        if not year_arg or not month_arg:
            now = datetime.now()
            year = now.year
            month = now.month + 1  # check next month
            if month == 13:  # convert to January
                year += 1
                month = 1
        else:
            try:
                year = int(year_arg)
                month = int(month_arg)
            except ValueError:
                self.error(400)
                self.response.write("year and month must be integers")
                return
            if not 1 <= month <= 12:
                self.error(400)
                self.response.write("month must be between 1 and 12")
                return

        matches = VBStatus.query(VBStatus.year == year, VBStatus.month == month).fetch(1)
        vb_status = matches[0] if matches else None

        if vb_status is not None and vb_status.status and not force:
            logging.info("Already uploaded {year}-{month}".format(year=year, month=month))
            self.response.write("Already uploaded {year}-{month}".format(year=year, month=month))
            return

        if vb_status is None:
            # NOTE(review): the entity gets a parent key but the query above
            # is not an ancestor query -- confirm this is intended.
            vb_status = VBStatus(parent=ndb.Key('VB', 'whatever'))
            vb_status.year = year
            vb_status.month = month
            vb_status.status = False

        self.get_vb(year, month, vb_status, force)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement