Advertisement
Guest User

Untitled

a guest
Apr 26th, 2017
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.65 KB | None | 0 0
  1.  
  2. def getCandidateSpeechLinks(url):
  3. allCandidatePage = requests.get(url)
  4. allCandidatePageSoup = BeautifulSoup(allCandidatePage.text, 'lxml')
  5. links={}
  6. table = allCandidatePageSoup.find('table', width=680)
  7. for area in table.findAll('td', class_='doctext'):
  8. for a in area.findAll('a'):
  9. if ('campaign' in a.text.lower()):
  10. links[area.find('span', class_='roman').text] = a['href']
  11. return links
  12.  
  13. def scrapeCampaignSpeechesToFile(url, path):
  14. allSpeechPages = requests.get(url)
  15. allSpeechSoup=BeautifulSoup(allSpeechPages.text, 'lxml')
  16. root = 'http://www.presidency.ucsb.edu/'
  17. table = allSpeechSoup.find('table', width=700)
  18. links = []
  19. for link in table.findAll('a'):
  20. if('interview' not in link.text.lower()):
  21. links.append(root+(link['href'])[3:])
  22.  
  23. speechPages = [requests.get(link , 'lxml')for link in links]
  24. speechesSoup = [BeautifulSoup(speechPage.text, 'lxml') for speechPage in speechPages]
  25.  
  26. with open(path, "w+", encoding='utf-8') as outFile:
  27. outFile.seek(0)
  28. for i,speech in enumerate(speechesSoup):
  29. text = speechesSoup[i].find('span', class_='displaytext').text.replace('.','. ')
  30. text = re.sub('\[[a-zA-Z]*\]', ' ', text)
  31. text = re.sub('[A-Z]+ [A-Z]+:', ' ', text)
  32. text = re.sub('\w+:', ' ', text)
  33. text = re.sub(r'[^\x00-\x7F]+',' ', text)
  34.  
  35. outFile.write(text +'\n')
  36.  
  37. def trainMarkov(path):
  38.  
  39. # Get raw text as string.
  40. with open(path, encoding='utf-8') as f:
  41. text = f.read()
  42.  
  43. # Build the model.
  44. text_model = markovify.Text(text)
  45. return text_model
  46.  
  47. def campaignLinkToBots(url, year):
  48.  
  49. dataFolder = './Campaign Speeches/'+ str(year) +'/'
  50.  
  51. if not os.path.exists(dataFolder):
  52. os.makedirs(dataFolder)
  53.  
  54. #Create the dictionary of each candidate's name and link to their campaign speech page
  55. campaignSpeechLinkDict = getCandidateSpeechLinks(url)
  56.  
  57. root = 'http://www.presidency.ucsb.edu/'
  58.  
  59. #Loops through the campagin speech links, puts each candidate's campagin speeches into individual files
  60. for name, url in campaignSpeechLinkDict.items():
  61. path = dataFolder + name.replace(' ', '-') + '.txt'
  62. if not os.path.isfile(path):
  63. scrapeCampaignSpeechesToFile(root + url, path)
  64.  
  65. #Train the bots and store them in a dictionary
  66. bots = {}
  67. for pres in campaignSpeechLinkDict.keys():
  68. bots[pres] = trainMarkov(dataFolder + pres.replace(' ', '-') + '.txt')
  69.  
  70. #return the bot dictionary
  71. return bots
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement