Guest User

Untitled

a guest
Jun 22nd, 2018
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.98 KB | None | 0 0
  1. import glob
  2. import os
  3. import re
  4. import pandas as pd
  5. import string
  6.  
  7. def tokenize(line):
  8. return line.split()
  9.  
  10. def normalize(line):
  11. table = line.maketrans('', '', string.punctuation)
  12. return line.translate(table)
  13.  
  14. def create_incidence_matrix(
  15. document_paths,
  16. tokenize=tokenize,
  17. normalize=normalize):
  18. """Create incidence matrix from texts under document paths.
  19. """
  20.  
  21. docs = []
  22.  
  23. # Collecting terms.
  24. for p in document_paths:
  25. terms = []
  26.  
  27. for txt in glob.glob(os.path.join(p, '*.txt')):
  28. with open(txt) as f:
  29. terms.append(sum(
  30. [tokenize(normalize(l)) for l in f], []))
  31.  
  32. docs.append(list(set(sum(terms, []))))
  33.  
  34. # Create dictionary.
  35. vocab = [w for w in set(sum(docs, []))]
  36.  
  37. # Create matrix.
  38. data = dict((os.path.basename(p),
  39. dict((w, 1 if w in doc else 0) for w in vocab))
  40. for p, doc in zip(document_paths, docs))
  41.  
  42. return pd.DataFrame(data)
Add Comment
Please, Sign In to add comment