a guest Jun 22nd, 2018 60 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
- import glob
- import os
- import re
- import pandas as pd
- import string
def tokenize(line: str) -> list:
    """Split *line* into terms on runs of whitespace.

    Leading and trailing whitespace (including the newline left by file
    iteration) is ignored, so an empty or blank line yields an empty list.
    """
    return line.split()
def normalize(line: str) -> str:
    """Return *line* with every ASCII punctuation character removed.

    The characters dropped are exactly those in ``string.punctuation``;
    letters, digits and whitespace are left untouched.
    """
    # The third argument of maketrans lists characters to delete outright.
    table = str.maketrans('', '', string.punctuation)
    return line.translate(table)
def create_incidence_matrix(document_paths):
    """Create incidence matrix from texts under document paths.

    Each entry of ``document_paths`` is treated as a directory. Every
    ``*.txt`` file directly inside it is read line by line, passed through
    ``normalize`` and ``tokenize``, and the resulting terms are pooled, so
    that all text files of one directory together form one document.

    Args:
        document_paths: Iterable of directory paths, one per document.

    Returns:
        A ``pandas.DataFrame`` with one row per vocabulary term (in sorted
        order) and one column per directory, labelled with the directory's
        base name. A cell holds 1 if the term occurs in that document and
        0 otherwise.

    Note:
        Directories that share a base name map to the same column label, so
        a later one replaces an earlier one. Files are opened with the
        platform default text encoding.
    """
    # Materialise once: the paths are walked twice (collection, then matrix
    # construction), which would silently drop everything for a generator.
    document_paths = list(document_paths)

    # Collecting terms.
    docs = []
    for path in document_paths:
        terms = set()
        for txt in glob.glob(os.path.join(path, '*.txt')):
            with open(txt) as f:
                for line in f:
                    terms.update(tokenize(normalize(line)))
        docs.append(terms)

    # Create dictionary. Sorting makes the row order reproducible instead of
    # depending on set iteration order.
    vocab = sorted(set().union(*docs))

    # Create matrix. normpath strips a trailing separator first; without it
    # basename('some/dir/') is '' and every such column would collide.
    data = {
        os.path.basename(os.path.normpath(path)): {
            term: 1 if term in doc else 0 for term in vocab
        }
        for path, doc in zip(document_paths, docs)
    }
    return pd.DataFrame(data)
RAW Paste Data