# Splits a record into its fields: the leading "<digits>: " prefix and any
# run of 20+ characters free of ',', ';' and '.' both act as separators.
_FIELD_SPLIT_PATTERN = r'\d+: |[^,;.]{20,}'
# One author name: a word, an optional comma, a space, then word chars/dots.
_AUTHOR_PATTERN = r'\w+,? [\w.]+'
# The description: from the first 20+ character separator-free run to the
# end of the line.
_DESCRIPTION_PATTERN = r'([^,;.]{20,}.*)'


def _parse_authors(line):
    """Return the author names found in the second field of *line*.

    The line is split with ``_FIELD_SPLIT_PATTERN``; the names are searched
    for in the field at index 1.  A line that does not split into at least
    two fields yields an empty list instead of raising ``IndexError``.
    """
    fields = re.split(_FIELD_SPLIT_PATTERN, line)
    if len(fields) < 2:
        return []
    return re.findall(_AUTHOR_PATTERN, fields[1])


def main():
    """Rewrite INPUT as tab-separated rows in OUTPUT.

    Reads INPUT, prints the row count and the value returned by
    ``authorcount``, and constructs ``Matrix(columns, rows)``.  Then, for
    every non-blank line, writes the author names (each followed by a tab)
    followed by the description part of the line, and passes the row to
    ``buildMatrix``.

    Any exception raised while processing the lines is printed and stops
    the run; rows written before the failure stay in OUTPUT and the
    "Wrote file" message is not printed.

    OUTPUT is opened (and truncated) only after the counts have been
    computed, and both files are closed on every path.
    """
    with open(INPUT) as source:
        text = source.read()
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Processing: %s" % (INPUT))

    rows = rowcount(INPUT)
    print("Total rows in file: %s" % (rows))
    columns = authorcount(INPUT)
    print("Max author count: %s" % (columns))
    matrix = Matrix(columns, rows)

    with open(OUTPUT, 'w') as output:
        try:
            for line in text.split('\n'):
                # split('\n') produces an empty last element for a file that
                # ends with a newline; blank lines carry no record.
                if not line.strip():
                    continue
                authors = _parse_authors(line)
                # Put a tab in front of the description so it can be cut off
                # from the leading "<n>: authors" part.
                tabbed = re.sub(_DESCRIPTION_PATTERN, r'\t\1', line) + '\n'
                # Raises IndexError (reported below) when the line has no
                # description run, i.e. no tab was inserted.
                description = tabbed.split('\t')[1]
                row = ''.join(name + '\t' for name in authors) + description
                output.write(row)
                buildMatrix(matrix, row, rows, columns)
        except Exception as err:
            # Top-level boundary of the script: report the problem and stop.
            print(err)
            return
    print("Wrote file: %s" % (OUTPUT))


def buildMatrix(matrix, row, rows, columns):
    """Print ``columns`` together with the last tab-separated field of *row*.

    *matrix* and *rows* are accepted but not used yet, and nothing is stored
    or returned.
    NOTE(review): this looks like an unfinished debug stub -- confirm the
    intended matrix layout before building on it.
    """
    fields = row.split('\t')
    cell = [columns]
    # list.insert clamps an index past the end, so for columns >= 1 the last
    # field ends up after the column count.
    cell.insert(columns, fields[-1])
    print(cell)


def rowcount(filename):
    """Return the number of lines in the file *filename*."""
    with open(filename) as handle:
        return sum(1 for _ in handle)


def authorcount(filename):
    """Return the largest number of author names on any line, plus one.

    Blank lines are ignored and lines without an author field count as zero
    authors, so an empty file yields 1.
    NOTE(review): the extra one presumably reserves a column for the
    description -- confirm against the Matrix usage.
    """
    with open(filename) as handle:
        content = handle.read()
    counts = [
        len(_parse_authors(line))
        for line in content.split('\n')
        if line.strip()
    ]
    # max() of an empty sequence raises ValueError; fall back to zero authors.
    return max(counts or [0]) + 1