daily pastebin goal
68%
SHARE
TWEET

Untitled

a guest Jun 22nd, 2018 60 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
import glob
import os
import re
import string

import pandas as pd
  6.  
  7. def tokenize(line):
  8.     return line.split()
  9.  
  10. def normalize(line):
  11.     table = line.maketrans('', '', string.punctuation)
  12.     return line.translate(table)
  13.  
  14. def create_incidence_matrix(
  15.         document_paths,
  16.         tokenize=tokenize,
  17.         normalize=normalize):
  18.     """Create incidence matrix from texts under document paths.
  19.     """
  20.  
  21.     docs = []
  22.  
  23.     # Collecting terms.
  24.     for p in document_paths:
  25.         terms = []
  26.  
  27.         for txt in glob.glob(os.path.join(p, '*.txt')):
  28.             with open(txt) as f:
  29.                 terms.append(sum(
  30.                     [tokenize(normalize(l)) for l in f], []))
  31.  
  32.         docs.append(list(set(sum(terms, []))))
  33.  
  34.     # Create dictionary.
  35.     vocab = [w for w in set(sum(docs, []))]
  36.  
  37.     # Create matrix.
  38.     data = dict((os.path.basename(p),
  39.                  dict((w, 1 if w in doc else 0) for w in vocab))
  40.                 for p, doc in zip(document_paths, docs))
  41.  
  42.     return pd.DataFrame(data)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top