Advertisement
Guest User

Scraping

a guest
Sep 4th, 2015
212
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.01 KB | None | 0 0
  1. import tweepy
  2. import re
  3. import requests
  4. import untangle
  5.  
  6.  
  7. def getTwitterResults():
  8.  
  9.     listOfPossibleLocations = []
  10.  
  11.     #Authentification keys nessccesary to utilize the Twitter API:
  12.     consumer_key = 'Key'
  13.     consumer_secret = 'Secret'
  14.     access_token = 'Token'
  15.     access_token_secret = 'Secret'
  16.  
  17.     auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
  18.     auth.set_access_token(access_token, access_token_secret)
  19.  
  20.     api = tweepy.API(auth)
  21.  
  22.     #Specifying which users timeline to scrap, and how many tweets:
  23.     timeline = api.user_timeline(id = 'rogalandops', count = 5)
  24.  
  25.     for status in timeline:
  26.         update = (status.text+'\n')
  27.         print(update) #Printing the tweets
  28.  
  29.         #Regex to search the tweets for place names
  30.         mostcomplex_match = re.findall(r'[A-Z]\w*\s+\d+\s+\w*', update)
  31.         medium_complex_match = re.findall(r'[A-Z]\w*\s+\d+\s', update)
  32.         least_complex_match = re.findall(r'\b[A-Z].*?\b', update)
  33.         another_try = re.findall(r'[A-Z]\S+', update)
  34.  
  35.         if mostcomplex_match:
  36.             print(mostcomplex_match)
  37.             print('most complex')
  38.             listOfPossibleLocations.append(mostcomplex_match)
  39.  
  40.         elif medium_complex_match:
  41.             print(medium_complex_match)
  42.             print('medium')
  43.             listOfPossibleLocations.append(medium_complex_match)
  44.  
  45.         elif another_try:
  46.             print(another_try)
  47.             print('another try')
  48.             listOfPossibleLocations.append(another_try)
  49.  
  50.         elif least_complex_match:
  51.             print(least_complex_match)
  52.             print('least complex')
  53.             listOfPossibleLocations.append(least_complex_match)
  54.  
  55.         else:
  56.             print('No match')
  57.  
  58.     return listOfPossibleLocations
  59.  
  60. def getPlaceCoordinates(listOfPossibleLocations):
  61.     """Function gets the list of possible place names from getTwitterResults
  62.        and tries to find coordinates using kartverkets place search"""
  63.  
  64.     #Converting nested list to single list
  65.     singlelist = [item for sublist in listOfPossibleLocations for item in sublist]
  66.     print(singlelist)
  67.  
  68.     for words in singlelist:
  69.  
  70.         #print(words)
  71.  
  72.         #search_string = str(words)
  73.         try:
  74.             url = 'https://ws.geonorge.no/SKWS3Index/ssr/sok?navn='+words
  75.  
  76.  
  77.             obj = untangle.parse(url)
  78.             kommunenavn = obj.sokRes.stedsnavn.kommunenavn.cdata.encode('ascii', 'replace')
  79.             stedsnavn = obj.sokRes.stedsnavn.stedsnavn.cdata.encode('ascii', 'replace')
  80.             aust = obj.sokRes.stedsnavn.aust.cdata.encode('ascii', 'replace')
  81.             nord = obj.sokRes.stedsnavn.nord.cdata.encode('ascii', 'replace')
  82.             print(kommunenavn, stedsnavn, aust, nord)
  83.  
  84.         except (AttributeError,UnicodeEncodeError, IndexError) as e:
  85.             print(e)
  86.             pass
  87.  
  88.         else:
  89.             print(kommunenavn, stedsnavn, aust, nord)
  90.  
  91.  
  92. locations = getTwitterResults()
  93. coordinates = getPlaceCoordinates(locations)
  94.  
  95. #print(locations, coordinates)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement