Advertisement
JORGE1963

Untitled

Aug 18th, 2018
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.14 KB | None | 0 0
  # multi-html-to-kwic.py
#
# For every file in trialDirectory: read it, lower-case it, strip the HTML
# tags with obo.stripTags, split it into words, build n-grams of width n and
# print every n-gram that obo.nGramsToKWICDict stores under `target`.
# One block of output is printed per input file.

import os

import obo

# Directory holding the trial HTML files. EDIT THIS before running so that it
# points at your own 'programming-historian/the-black' directory, e.g.
#   PC:  'C:\\Documents and Settings\\<you>\\programming-historian\\the-black'
#   Mac: '/Users/<you>/programming-historian/the-black'
# The value below (inside the home directory) is only a placeholder default.
trialDirectory = os.path.join(
    os.path.expanduser('~'), 'programming-historian', 'the-black')

# Settings that do not change from file to file.
n = 7             # width of each n-gram
target = 'black'  # keyword looked up in the KWIC dictionary

# n//2 placeholder tokens added before and after each word list
# (presumably so words near either end can still sit mid n-gram -- confirm
# against obo.nGramsToKWICDict).
padding = ['#'] * (n // 2)

# sorted() makes the output order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(trialDirectory)):
    # os.path.join supplies the separator that plain '+' concatenation omits.
    path = os.path.join(trialDirectory, filename)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as trial files; skip them.
        continue

    # Get the text of each trial and strip away HTML tags. The with-block
    # closes the file even if reading fails.
    with open(path, 'r') as f:
        content = f.read().lower()
    text = obo.stripTags(content)

    # Create the dictionary of n-grams.
    fullwordlist = list(padding)
    fullwordlist += obo.stripNonAlphaNum(text)
    fullwordlist += padding
    ngrams = obo.getNGrams(fullwordlist, n)
    worddict = obo.nGramsToKWICDict(ngrams)

    # Build one output line per n-gram for the keyword, or a notice when the
    # keyword does not occur in this file.
    if target in worddict:
        outstr = ''.join(
            obo.prettyPrintKWIC(k) + '\n' for k in worddict[target])
    else:
        outstr = 'Keyword not found in source'

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(outstr)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement