Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# multi-html-to-kwic.py
#
# Print keyword-in-context (KWIC) lines for one keyword, once for every
# file found in a directory of saved trial pages.
#
# Usage:
#     python multi-html-to-kwic.py <path to 'programming-historian/the-black' directory>
#
# Instead of passing the directory on the command line you may assign it
# to trialDirectory below, for example:
#     PC:  'C:\\Documents and Settings\\<your path>\\programming-historian\\the-black'
#     Mac: '/Users/<your path>/programming-historian/the-black'
import os
import sys

import obo

# Directory containing the trial files. A command-line argument, when
# given, takes precedence over the value assigned here.
trialDirectory = None
if len(sys.argv) > 1:
    trialDirectory = sys.argv[1]
if not trialDirectory:
    sys.exit("No trial directory given: pass it as the first argument "
             "or assign trialDirectory in this script.")

# Settings shared by every file, so they are computed once up front.
n = 7                        # number of words in each n-gram
target = 'black'             # keyword to look up
# n//2 '#' placeholders added to both ends of each word list; presumably so
# that words near the start/end of a text can still sit in the middle of
# an n-gram -- confirm against obo.getNGrams.
padding = ['#'] * (n // 2)

for filename in os.listdir(trialDirectory):
    # os.path.join inserts the path separator that plain string
    # concatenation omitted when the directory had no trailing slash.
    path = os.path.join(trialDirectory, filename)
    if not os.path.isfile(path):
        # open() cannot read sub-directories; skip them.
        continue

    # get the text of each trial and strip away HTML tags
    with open(path, 'r') as f:   # 'with' closes the file even on error
        content = f.read().lower()
    text = obo.stripTags(content)

    # create dictionary of n-grams
    fullwordlist = padding + list(obo.stripNonAlphaNum(text)) + padding
    ngrams = obo.getNGrams(fullwordlist, n)
    worddict = obo.nGramsToKWICDict(ngrams)

    # output the KWIC lines for the keyword, one per line
    if target in worddict:
        outstr = ''.join(obo.prettyPrintKWIC(k) + '\n'
                         for k in worddict[target])
    else:
        outstr = 'Keyword not found in source'
    print(outstr)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement