Advertisement
Guest User

Untitled

a guest
Mar 30th, 2015
243
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.89 KB | None | 0 0
  1. from lxml import html
  2. import requests
  3. import simplekml
  4. import re
  5.  
  6. # base url for pdga site
  7. baseURL = r'http://www.pdga.com'
  8. # search url for all USA courses
  9. searchURL = r'/course-directory/advanced?title=&field_course_location_country=US&field_course_location_locality=&field_course_location_administrative_area=All&field_course_location_postal_code=&rating_value=All&field_course_holes_value=All&field_course_total_length_value=All&field_course_target_type_value=All&field_course_tee_type_value=All&field_course_camping_value=All&field_course_facilities_value=All&field_course_fees_value=All&field_course_handicap_value=All&field_course_private_value=All&field_course_signage_value=All&page='
  10.  
  11. # xpaths for extracting data
  12. xpaths = {
  13.    'Name': r'//td[@class="views-field views-field-title"]/a/text()',
  14.    'City': r'//td[@class="views-field views-field-field-course-location"]/text()',
  15.    'State': r'//td[@class="views-field views-field-field-course-location-2"]/span/text()',
  16.    'Holes': r'//td[@class="views-field views-field-field-course-holes"]/text()',
  17.    'link': r'//td[@class="views-field views-field-title"]/a/@href',
  18.    'Address': r'//div[@class="street-block"]/div/text()',
  19.    'desc': r'//*[@id="course"]/div[1]/div[1]/div[1]/div/div[7]/div/div/div/div/text()',
  20.    'details': r'//*[@id="quicktabs-tabpage-course_node-0"]/div/div/div',
  21.    }
  22.  
  23. # Data found on the search result page.  Each course will have its own element in each list
  24. searchPageData = {
  25.    'Name': [],
  26.    'City': [],
  27.    'State': [],
  28.    'Holes': [],
  29.    'link': [],
  30.    }
  31.  
  32. # Object to represent a single course
  33. class Course( object ):
  34.    Name = ''
  35.    City = ''
  36.    State = ''
  37.    Holes = ''
  38.    Address = ''
  39.    description = ''
  40.    details = ''
  41.  
  42.    def __init__( self, Name, City, State, Holes, link ):
  43.       self.Name = Name
  44.       self.City = City
  45.       self.State = State
  46.       self.Holes = Holes
  47.       self.link = link
  48.  
  49.       # make request to the course's specific page
  50.       self.subpage = requests.get( baseURL + self.link )
  51.       self.tree = html.fromstring( self.subpage.text )
  52.  
  53.       self.fillSubPageData( )
  54.  
  55.    def fillSubPageData( self ):
  56.       temp = self.tree.xpath( xpaths[ 'Address' ] )
  57.       # protect against the Address not existing; just use the name
  58.       if temp == []:
  59.          temp = [self.Name]
  60.       self.Address = temp[0] + ', ' + self.City + ', ' + self.State
  61.       temp = self.tree.xpath( xpaths[ 'desc' ] )
  62.       # protect against the description not existing; just use the name
  63.       if temp == []:
  64.          temp = [self.Name]
  65.       self.description = temp[0]
  66.  
  67.       temp = self.tree.xpath( xpaths[ 'details' ] )
  68.       detailsStr = ''
  69.       if temp == []:
  70.          detailsStr = self.description
  71.       else:
  72.          for child in temp[0].getchildren( ):
  73.             detailsStr = detailsStr + '%s\n' % child.text_content( )
  74.       self.details = detailsStr
  75.  
  76. def addPoint( kml, course ):
  77.    # create a new point in the KML object
  78.    pnt = kml.newpoint(name=course.Name)
  79.  
  80.    # update address
  81.    pnt.address = sanitizeStr( course.Address )
  82.  
  83.    # regex to pull the coordinates out of the course page's javascript code
  84.    m = re.search(r'"coordinates":\[(-*\d+\.\d+),(-*\d+\.\d+)\]', course.subpage.text)
  85.    pnt.coords = [(float(m.groups()[0]), float(m.groups()[1]))]
  86.  
  87.    # update description
  88.    pnt.description = sanitizeStr( 'Holes: %s\n\n' % course.Holes + course.description + '\n\n' + course.details )
  89.  
  90. def sanitizeStr( inputStr ):
  91.    return inputStr.replace( '&', '&' )
  92.  
  93. def scrape( numPages ):
  94.  
  95.    # initialize the KML object
  96.    kml = simplekml.Kml()
  97.  
  98.    # Loop through each page of the search results
  99.    for pageIdx in range( 0, numPages ):
  100.       print 'Page: %d' % pageIdx
  101.  
  102.       # grab a page of search results
  103.       page = requests.get(baseURL + searchURL + '%d' % pageIdx)
  104.       tree = html.fromstring(page.text)
  105.  
  106.       # grab data for each course listed
  107.       for key in searchPageData.keys( ):
  108.          searchPageData[key] = tree.xpath( xpaths[key] )
  109.          for d in range(len(searchPageData[key])):
  110.             # remove beginning and trailing whitespace
  111.             searchPageData[key][d] = searchPageData[key][d].strip(  )
  112.  
  113.       # loop through each course on this page
  114.       for courseIdx in range( 0, len( searchPageData['Name'] ) ):
  115.          print searchPageData['Name'][ courseIdx ]
  116.          try:
  117.             # create a Course instance
  118.             course = Course( searchPageData['Name'][ courseIdx ], searchPageData['City'][ courseIdx ], searchPageData['State'][ courseIdx ], searchPageData['Holes'][ courseIdx ], searchPageData['link'][ courseIdx ] )
  119.             # add the course data to the kml object
  120.             addPoint( kml, course )
  121.          except Exception as theE:
  122.             print 'Error with: %s' % searchPageData['Name'][ courseIdx ]
  123.             print theE.message
  124.  
  125.    # save the KML data
  126.    kml.save( r'C:\kml\PDGA_Course_List_USA.kml' )
  127.  
  128.    return
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement