import glob
import os
import string

import pandas as pd


def tokenize(line):
    return line.split()


def normalize(line):
    # Strip punctuation before tokenizing.
    table = str.maketrans('', '', string.punctuation)
    return line.translate(table)


def create_incidence_matrix(
        document_paths,
        tokenize=tokenize,
        normalize=normalize):
    """Create an incidence matrix from the texts under the document paths."""

    docs = []

    # Collect the distinct terms of each document (one folder of *.txt files per document).
    for p in document_paths:
        terms = []

        for txt in glob.glob(os.path.join(p, '*.txt')):
            with open(txt) as f:
                terms.append(sum(
                    [tokenize(normalize(l)) for l in f], []))

        docs.append(list(set(sum(terms, []))))

    # Create the vocabulary.
    vocab = list(set(sum(docs, [])))

    # Create the matrix: 1 if a term occurs in a document, 0 otherwise.
    data = dict((os.path.basename(p),
                 dict((w, 1 if w in doc else 0) for w in vocab))
                for p, doc in zip(document_paths, docs))

    return pd.DataFrame(data)
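
A minimal usage sketch follows. The corpus/doc_a and corpus/doc_b folder names are hypothetical placeholders, each standing in for one document made up of *.txt files; adjust them to your own layout.

if __name__ == '__main__':
    # Hypothetical layout: one folder per document, each holding *.txt files.
    paths = ['corpus/doc_a', 'corpus/doc_b']
    matrix = create_incidence_matrix(paths)
    # Rows are vocabulary terms, columns are the folder names, values are 0/1.
    print(matrix)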