Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def getCandidateSpeechLinks(url):
    """Return a dict mapping candidate name -> href of their campaign-speech page.

    Scrapes the all-candidates index at *url*: looks inside the 680px-wide
    table and keeps only anchors whose link text mentions 'campaign'.
    """
    index_page = requests.get(url)
    soup = BeautifulSoup(index_page.text, 'lxml')
    candidate_links = {}
    table = soup.find('table', width=680)
    for cell in table.findAll('td', class_='doctext'):
        for anchor in cell.findAll('a'):
            if 'campaign' not in anchor.text.lower():
                continue
            # The candidate's name lives in the cell's 'roman' span.
            candidate_links[cell.find('span', class_='roman').text] = anchor['href']
    return candidate_links
def scrapeCampaignSpeechesToFile(url, path):
    """Scrape every non-interview speech linked from *url* and write the
    cleaned text to *path*, one speech per line, UTF-8 encoded.

    Cleaning removes bracketed stage directions (e.g. '[applause]'),
    'FIRST LAST:' speaker labels, any remaining 'word:' prefixes, and
    non-ASCII characters.
    """
    allSpeechPages = requests.get(url)
    allSpeechSoup = BeautifulSoup(allSpeechPages.text, 'lxml')
    root = 'http://www.presidency.ucsb.edu/'
    table = allSpeechSoup.find('table', width=700)
    links = []
    for link in table.findAll('a'):
        if 'interview' not in link.text.lower():
            # hrefs start with '../'; drop it and prepend the site root.
            links.append(root + link['href'][3:])
    # BUG FIX: the original called requests.get(link, 'lxml'), passing
    # 'lxml' as the `params` argument and appending '?lxml' to every
    # request URL. The parser name belongs only in the BeautifulSoup call.
    speechPages = [requests.get(link) for link in links]
    speechesSoup = [BeautifulSoup(page.text, 'lxml') for page in speechPages]
    # Compile the cleanup patterns once (raw strings avoid invalid-escape
    # warnings) instead of recompiling per speech.
    bracketed = re.compile(r'\[[a-zA-Z]*\]')    # stage directions like [laughter]
    speaker = re.compile(r'[A-Z]+ [A-Z]+:')     # 'FIRST LAST:' speaker labels
    label = re.compile(r'\w+:')                 # any leftover 'word:' prefix
    non_ascii = re.compile(r'[^\x00-\x7F]+')
    # Plain 'w' truncates and positions at the start; the original's
    # 'w+' + seek(0) was redundant.
    with open(path, 'w', encoding='utf-8') as outFile:
        for speech in speechesSoup:
            # Space after periods so sentence boundaries survive cleanup.
            text = speech.find('span', class_='displaytext').text.replace('.', '. ')
            text = bracketed.sub(' ', text)
            text = speaker.sub(' ', text)
            text = label.sub(' ', text)
            text = non_ascii.sub(' ', text)
            outFile.write(text + '\n')
def trainMarkov(path):
    """Read the UTF-8 text file at *path* and return a markovify.Text
    model built from its contents."""
    with open(path, encoding='utf-8') as corpus:
        return markovify.Text(corpus.read())
def campaignLinkToBots(url, year):
    """Scrape campaign speeches for every candidate linked at *url* and
    return a dict mapping candidate name -> trained markovify model.

    Speech files are cached under './Campaign Speeches/<year>/'; a
    candidate is only re-scraped when their file is missing.
    """
    dataFolder = './Campaign Speeches/' + str(year) + '/'
    if not os.path.exists(dataFolder):
        os.makedirs(dataFolder)
    # Map each candidate's name to the link for their campaign-speech page.
    campaignSpeechLinkDict = getCandidateSpeechLinks(url)
    root = 'http://www.presidency.ucsb.edu/'
    bots = {}
    # Single pass over the candidates: scrape (when not cached) then train.
    # The original used two loops and rebuilt the path in each; its loop
    # variable also shadowed the 'url' parameter — renamed here.
    for name, speechPageLink in campaignSpeechLinkDict.items():
        path = dataFolder + name.replace(' ', '-') + '.txt'
        if not os.path.isfile(path):
            scrapeCampaignSpeechesToFile(root + speechPageLink, path)
        bots[name] = trainMarkov(path)
    return bots
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement