# Untitled

# multi-html-to-kwic.py

import obo, os

#get all of the filenames:

#uncomment this line for PC
#trialDirectory = ("C:\\Documents and Settings\\Replace with Path to 'programming-historian/the-black' directory")

#uncomment this line for Mac
#trialDirectory = ("/Users/Replace with Path to 'programming-historian/the-black' directory")

# For every file in trialDirectory: read it, strip the HTML, build n-grams,
# and print the keyword-in-context (KWIC) lines for `target`.
#
# These settings are identical for every trial, so they are defined once
# instead of being re-created on each pass through the loop.
n = 7
target = 'black'
# n//2 '#' placeholder tokens, added to both ends of each word list
# (presumably so words near the start/end still get a full n-word window;
# confirm against obo.getNGrams).
padding = ['#'] * (n // 2)

trials = os.listdir(trialDirectory)
for files in trials:
    # os.path.join supplies the path separator when trialDirectory has no
    # trailing one; plain string concatenation would build a wrong path.
    path = os.path.join(trialDirectory, files)

    # os.listdir also returns sub-directories, which cannot be opened as
    # files, so skip anything that is not a regular file.
    if not os.path.isfile(path):
        continue

    # get the text of each trial and strip away HTML tags;
    # the with-block closes the file even if reading raises
    with open(path, 'r') as f:
        content = f.read().lower()
    text = obo.stripTags(content)

    # create dictionary of n-grams
    fullwordlist = list(padding)
    fullwordlist += obo.stripNonAlphaNum(text)
    fullwordlist += padding
    ngrams = obo.getNGrams(fullwordlist, n)
    worddict = obo.nGramsToKWICDict(ngrams)

    # output KWIC and wrap with html
    # `in` replaces dict.has_key(), which no longer exists in Python 3
    if target in worddict:
        # one formatted line per occurrence, each terminated by a newline
        outstr = ''.join(obo.prettyPrintKWIC(k) + '\n'
                         for k in worddict[target])
    else:
        outstr = 'Keyword not found in source'
    # the parenthesised form prints the same text on Python 2 and Python 3
    print(outstr)