Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- import glob
- import os
- import re
- import pandas as pd
- import string
def tokenize(line):
    """Split *line* into whitespace-separated tokens.

    Args:
        line: input string.

    Returns:
        List of tokens; empty list for an empty or all-whitespace line.
    """
    return line.split()
def normalize(line):
    """Return *line* with every ASCII punctuation character removed.

    Uses a single C-level ``str.translate`` pass over the string rather
    than chained per-character replacements.

    Args:
        line: input string.

    Returns:
        The string with all characters in ``string.punctuation`` deleted.
    """
    # str.maketrans('', '', deletechars) builds a deletion table.
    deletion_table = str.maketrans('', '', string.punctuation)
    return line.translate(deletion_table)
def create_incidence_matrix(
        document_paths,
        tokenize=tokenize,
        normalize=normalize):
    """Create a term-document incidence matrix from texts under document paths.

    Each directory in *document_paths* is treated as one document: every
    ``*.txt`` file directly inside it is read line by line, normalized,
    and tokenized; the document's term set is the union of all tokens.

    Args:
        document_paths: iterable of directory paths, one per document.
        tokenize: callable mapping a line to a list of tokens.
        normalize: callable mapping a raw line to a cleaned-up line.

    Returns:
        pandas.DataFrame with one column per document (named after the
        directory's basename) and one row per vocabulary term, in sorted
        term order; a cell is 1 if the term occurs in that document,
        else 0.
    """
    docs = []
    for path in document_paths:
        # Accumulate the distinct terms of this document in a set:
        # avoids the quadratic sum(lists, []) flattening and gives O(1)
        # membership tests when filling the matrix below.
        terms = set()
        for txt in glob.glob(os.path.join(path, '*.txt')):
            # NOTE(review): uses the platform default text encoding, as
            # the original did — confirm whether UTF-8 should be forced.
            with open(txt) as f:
                for line in f:
                    terms.update(tokenize(normalize(line)))
        docs.append(terms)
    # Sort the vocabulary so the DataFrame row order is deterministic
    # across runs (iterating a raw set of strings is not, due to hash
    # randomization).
    vocab = sorted(set().union(*docs))
    data = {
        os.path.basename(path): {w: (1 if w in doc else 0) for w in vocab}
        for path, doc in zip(document_paths, docs)
    }
    return pd.DataFrame(data)
Add Comment
Please, Sign In to add comment