Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def getCandidateSpeechLinks(url):
    """Return a dict mapping candidate name -> href of their campaign-speech page.

    Scrapes the all-candidates index at *url*: looks inside the 680px-wide
    table and keeps only anchors whose link text mentions 'campaign'.
    """
    index_page = requests.get(url)
    soup = BeautifulSoup(index_page.text, 'lxml')
    candidate_links = {}
    table = soup.find('table', width=680)
    for cell in table.findAll('td', class_='doctext'):
        for anchor in cell.findAll('a'):
            if 'campaign' not in anchor.text.lower():
                continue
            # The candidate's name lives in the cell's 'roman' span.
            candidate_links[cell.find('span', class_='roman').text] = anchor['href']
    return candidate_links
def scrapeCampaignSpeechesToFile(url, path):
    """Scrape every non-interview speech linked from *url* and write the
    cleaned text to *path*, one speech per line, UTF-8 encoded.

    Cleaning removes bracketed stage directions (e.g. '[applause]'),
    'FIRST LAST:' speaker labels, any remaining 'word:' prefixes, and
    non-ASCII characters.
    """
    allSpeechPages = requests.get(url)
    allSpeechSoup = BeautifulSoup(allSpeechPages.text, 'lxml')
    root = 'http://www.presidency.ucsb.edu/'
    table = allSpeechSoup.find('table', width=700)
    links = []
    for link in table.findAll('a'):
        if 'interview' not in link.text.lower():
            # hrefs start with '../'; drop it and prepend the site root.
            links.append(root + link['href'][3:])
    # BUG FIX: the original called requests.get(link, 'lxml'), passing
    # 'lxml' as the `params` argument and appending '?lxml' to every
    # request URL. The parser name belongs only in the BeautifulSoup call.
    speechPages = [requests.get(link) for link in links]
    speechesSoup = [BeautifulSoup(page.text, 'lxml') for page in speechPages]
    # Compile the cleanup patterns once (raw strings avoid invalid-escape
    # warnings) instead of recompiling per speech.
    bracketed = re.compile(r'\[[a-zA-Z]*\]')    # stage directions like [laughter]
    speaker = re.compile(r'[A-Z]+ [A-Z]+:')     # 'FIRST LAST:' speaker labels
    label = re.compile(r'\w+:')                 # any leftover 'word:' prefix
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    # Plain 'w' truncates and positions at the start; the original's
    # 'w+' + seek(0) was redundant.
    with open(path, 'w', encoding='utf-8') as outFile:
        for speech in speechesSoup:
            # Space after periods so sentence boundaries survive cleanup.
            text = speech.find('span', class_='displaytext').text.replace('.', '. ')
            text = bracketed.sub(' ', text)
            text = speaker.sub(' ', text)
            text = label.sub(' ', text)
            text = non_ascii.sub(' ', text)
            outFile.write(text + '\n')
def trainMarkov(path):
    """Read the UTF-8 text file at *path* and return a markovify.Text
    model built from its contents."""
    with open(path, encoding='utf-8') as corpus:
        return markovify.Text(corpus.read())
def campaignLinkToBots(url, year):
    """Scrape campaign speeches for every candidate linked at *url* and
    return a dict mapping candidate name -> trained markovify model.

    Speech files are cached under './Campaign Speeches/<year>/'; a
    candidate is only re-scraped when their file is missing.
    """
    dataFolder = './Campaign Speeches/' + str(year) + '/'
    if not os.path.exists(dataFolder):
        os.makedirs(dataFolder)
    # Map each candidate's name to the link for their campaign-speech page.
    campaignSpeechLinkDict = getCandidateSpeechLinks(url)
    root = 'http://www.presidency.ucsb.edu/'
    bots = {}
    # Single pass over the candidates: scrape (when not cached) then train.
    # The original used two loops and rebuilt the path in each; its loop
    # variable also shadowed the 'url' parameter — renamed here.
    for name, speechPageLink in campaignSpeechLinkDict.items():
        path = dataFolder + name.replace(' ', '-') + '.txt'
        if not os.path.isfile(path):
            scrapeCampaignSpeechesToFile(root + speechPageLink, path)
        bots[name] = trainMarkov(path)
    return bots
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement