Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from lxml import html
- import requests
- import simplekml
- import re
# Root of the PDGA web site; every relative link below is joined onto this.
baseURL = r'http://www.pdga.com'

# Advanced course-directory search restricted to the USA with every other
# filter left at its "All"/empty setting.  The zero-based result page number
# is appended directly after the trailing "page=".
searchURL = (
    r'/course-directory/advanced'
    r'?title='
    r'&field_course_location_country=US'
    r'&field_course_location_locality='
    r'&field_course_location_administrative_area=All'
    r'&field_course_location_postal_code='
    r'&rating_value=All'
    r'&field_course_holes_value=All'
    r'&field_course_total_length_value=All'
    r'&field_course_target_type_value=All'
    r'&field_course_tee_type_value=All'
    r'&field_course_camping_value=All'
    r'&field_course_facilities_value=All'
    r'&field_course_fees_value=All'
    r'&field_course_handicap_value=All'
    r'&field_course_private_value=All'
    r'&field_course_signage_value=All'
    r'&page='
)

# XPath expressions used to pull data out of the parsed pages.  The first
# five are applied to a search-result page; 'Address', 'desc' and 'details'
# are applied to an individual course page.
xpaths = {
    # search-result page
    'Name': r'//td[@class="views-field views-field-title"]/a/text()',
    'City': r'//td[@class="views-field views-field-field-course-location"]/text()',
    'State': r'//td[@class="views-field views-field-field-course-location-2"]/span/text()',
    'Holes': r'//td[@class="views-field views-field-field-course-holes"]/text()',
    'link': r'//td[@class="views-field views-field-title"]/a/@href',
    # course page
    'Address': r'//div[@class="street-block"]/div/text()',
    'desc': r'//*[@id="course"]/div[1]/div[1]/div[1]/div/div[7]/div/div/div/div/text()',
    'details': r'//*[@id="quicktabs-tabpage-course_node-0"]/div/div/div',
}

# Per-page scratch storage for the search results: one list per field, with
# the i-th entry of every list describing the i-th course on the page.
searchPageData = {
    field: [] for field in ('Name', 'City', 'State', 'Holes', 'link')
}
# Object to represent a single course
class Course(object):
    """A single course listed in the PDGA course directory.

    Constructing an instance performs an HTTP GET of the course's own page
    (``baseURL + link``), parses it with lxml and fills in the address,
    description and details from that page.

    Attributes:
        Name, City, State, Holes: values taken from the search-result row.
        link: site-relative URL of the course page.
        subpage: the ``requests`` response for the course page.
        tree: lxml tree parsed from ``subpage.text``.
        Address: ``"<street or Name>, <City>, <State>"``.
        description: first text node matched by ``xpaths['desc']``, or the
            course name when nothing matches.
        details: text content of every child of the details container, one
            per line, or the description when the container is missing.
    """

    # Seconds to wait on the course page; without a timeout a single
    # unresponsive request would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30

    Name = ''
    City = ''
    State = ''
    Holes = ''
    Address = ''
    description = ''
    details = ''

    def __init__(self, Name, City, State, Holes, link):
        """Store the search-result fields, then fetch and parse the course page.

        Any exception raised by ``requests.get`` (including a timeout) or by
        the HTML parser propagates to the caller.
        """
        self.Name = Name
        self.City = City
        self.State = State
        self.Holes = Holes
        self.link = link
        # make request to the course's specific page
        self.subpage = requests.get(baseURL + self.link,
                                    timeout=self.REQUEST_TIMEOUT)
        self.tree = html.fromstring(self.subpage.text)
        self.fillSubPageData()

    def fillSubPageData(self):
        """Populate Address, description and details from ``self.tree``."""
        # protect against the Address not existing; just use the name
        streetLines = self.tree.xpath(xpaths['Address'])
        street = streetLines[0] if streetLines else self.Name
        self.Address = street + ', ' + self.City + ', ' + self.State

        # protect against the description not existing; just use the name
        descText = self.tree.xpath(xpaths['desc'])
        self.description = descText[0] if descText else self.Name

        containers = self.tree.xpath(xpaths['details'])
        if containers:
            # Iterate the element itself: getchildren() is deprecated in
            # lxml, and iteration yields the same direct children.
            self.details = ''.join(
                '%s\n' % child.text_content() for child in containers[0]
            )
        else:
            # No details section on the page: fall back to the description.
            self.details = self.description
# Pulls the two numbers out of the '"coordinates":[a,b]' fragment embedded in
# the course page's javascript.  Accepts an optional leading minus sign, an
# optional fractional part and optional whitespace around the punctuation.
_COORDS_RE = re.compile(
    r'"coordinates":\s*\[\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\]'
)


def addPoint(kml, course):
    """Add a placemark for ``course`` to the ``kml`` document.

    Args:
        kml: object exposing ``newpoint(name=...)`` (a ``simplekml.Kml``).
        course: object providing ``Name``, ``Address``, ``Holes``,
            ``description``, ``details`` and ``subpage.text``.

    Raises:
        ValueError: if no coordinate pair can be found in the course page.
            The check happens before the placemark is created, so a course
            without coordinates never leaves a half-filled point in ``kml``.
    """
    # Locate the coordinates first; creating the point before this check
    # would leave a placemark with default coordinates behind on failure.
    m = _COORDS_RE.search(course.subpage.text)
    if m is None:
        raise ValueError('No coordinates found for course: %s' % course.Name)
    first, second = float(m.group(1)), float(m.group(2))

    # create a new point in the KML object
    pnt = kml.newpoint(name=course.Name)
    # update address
    pnt.address = sanitizeStr(course.Address)
    # The pair is passed through in the order it appears on the page.
    # NOTE(review): simplekml expects (lon, lat); presumably the page uses
    # the same order -- confirm against a known course.
    pnt.coords = [(first, second)]
    # update description
    pnt.description = sanitizeStr(
        'Holes: %s\n\n' % course.Holes
        + course.description + '\n\n' + course.details
    )
def sanitizeStr(inputStr):
    """Return ``inputStr`` with every ampersand escaped for XML/KML output.

    A bare ``&`` is not allowed in XML text, so it is replaced by the
    ``&amp;`` entity.  Replacing ``'&'`` with ``'&'`` (as this function
    previously did) changes nothing.

    NOTE(review): simplekml can escape some text fields itself (see
    ``Kml.parsetext``); confirm with the installed version that this does
    not double-escape the description.
    """
    return inputStr.replace('&', '&amp;')
def scrape(numPages, outputPath=r'C:\kml\PDGA_Course_List_USA.kml'):
    """Scrape search-result pages 0..numPages-1 and save all courses as KML.

    For each result page the listed fields are stored in the module-level
    ``searchPageData`` lists, a ``Course`` is built for every listed name
    (which fetches that course's own page) and a placemark is added for it.
    A failure on one course is printed and the scrape continues with the
    next one.

    Args:
        numPages: number of search-result pages to walk, starting at page 0.
        outputPath: where to write the KML file.  Defaults to the location
            this function has always written to.

    Raises:
        Whatever ``requests.get`` raises when a search-result page itself
        cannot be fetched; that is not caught here.
    """
    # initialize the KML object
    kml = simplekml.Kml()

    # Loop through each page of the search results
    for pageIdx in range(numPages):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Page: %d' % pageIdx)

        # grab a page of search results; the timeout keeps one stalled
        # request from hanging the whole run
        page = requests.get(baseURL + searchURL + '%d' % pageIdx, timeout=30)
        tree = html.fromstring(page.text)

        # grab data for each course listed, removing beginning and
        # trailing whitespace from every value
        for key in searchPageData:
            searchPageData[key] = [
                value.strip() for value in tree.xpath(xpaths[key])
            ]

        # loop through each course on this page
        for courseIdx, courseName in enumerate(searchPageData['Name']):
            print(courseName)
            try:
                # Index the sibling lists inside the try: if a field is
                # missing for some row the lists are shorter than 'Name' and
                # the IndexError is reported like any other per-course error.
                course = Course(
                    courseName,
                    searchPageData['City'][courseIdx],
                    searchPageData['State'][courseIdx],
                    searchPageData['Holes'][courseIdx],
                    searchPageData['link'][courseIdx],
                )
                # add the course data to the kml object
                addPoint(kml, course)
            except Exception as theE:
                # Best effort: report the course and keep going.
                print('Error with: %s' % courseName)
                # Exception.message no longer exists on Python 3; printing
                # the exception itself works everywhere.
                print(theE)

    # save the KML data
    kml.save(outputPath)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement