Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- import glob
- import os
- import re
- import pandas as pd
- import string
def tokenize(line):
    """Split *line* into whitespace-separated tokens.

    Args:
        line: input string.

    Returns:
        List of tokens; empty list for an empty or all-whitespace line.
    """
    return line.split()
def normalize(line):
    """Return *line* with every ASCII punctuation character removed.

    Uses a single C-level ``str.translate`` pass over the string rather
    than chained per-character replacements.

    Args:
        line: input string.

    Returns:
        The string with all characters in ``string.punctuation`` deleted.
    """
    # str.maketrans('', '', deletechars) builds a deletion table.
    deletion_table = str.maketrans('', '', string.punctuation)
    return line.translate(deletion_table)
def create_incidence_matrix(
        document_paths,
        tokenize=tokenize,
        normalize=normalize):
    """Create a term-document incidence matrix from texts under document paths.

    Each directory in *document_paths* is treated as one document: every
    ``*.txt`` file directly inside it is read line by line, normalized,
    and tokenized; the document's term set is the union of all tokens.

    Args:
        document_paths: iterable of directory paths, one per document.
        tokenize: callable mapping a line to a list of tokens.
        normalize: callable mapping a raw line to a cleaned-up line.

    Returns:
        pandas.DataFrame with one column per document (named after the
        directory's basename) and one row per vocabulary term, in sorted
        term order; a cell is 1 if the term occurs in that document,
        else 0.
    """
    docs = []
    for path in document_paths:
        # Accumulate the distinct terms of this document in a set:
        # avoids the quadratic sum(lists, []) flattening and gives O(1)
        # membership tests when filling the matrix below.
        terms = set()
        for txt in glob.glob(os.path.join(path, '*.txt')):
            # NOTE(review): uses the platform default text encoding, as
            # the original did — confirm whether UTF-8 should be forced.
            with open(txt) as f:
                for line in f:
                    terms.update(tokenize(normalize(line)))
        docs.append(terms)
    # Sort the vocabulary so the DataFrame row order is deterministic
    # across runs (iterating a raw set of strings is not, due to hash
    # randomization).
    vocab = sorted(set().union(*docs))
    data = {
        os.path.basename(path): {w: (1 if w in doc else 0) for w in vocab}
        for path, doc in zip(document_paths, docs)
    }
    return pd.DataFrame(data)
Add Comment
Please, Sign In to add comment