def main():
    """Rewrite each line of INPUT as a tab-separated row in OUTPUT.

    For every non-blank input line the author names are extracted, joined
    with tabs, and followed by the text that starts at the first run of 20
    or more characters containing none of ``,``, ``;`` or ``.``
    (presumably the title/description -- confirm against real input data).
    Each finished row is written to OUTPUT and also passed to
    ``buildMatrix``.

    Progress messages are printed to stdout.  Any error raised while rows
    are being produced is printed rather than propagated, and processing
    stops at that line; OUTPUT is closed either way.

    Relies on the module-level names ``INPUT``, ``OUTPUT``, ``Matrix``,
    ``rowcount``, ``authorcount`` and ``buildMatrix``.
    """
    with open(INPUT) as source:
        text = source.read()
    print("Processing: %s" % INPUT)

    rows = rowcount(INPUT)
    print("Total rows in file: %s" % rows)
    columns = authorcount(INPUT)
    print("Max author count: %s" % columns)
    matrix = Matrix(columns, rows)

    # Opened outside the ``try`` so that a failure to create OUTPUT still
    # propagates to the caller; the ``with`` below guarantees it is closed
    # even when a line cannot be processed.
    output = open(OUTPUT, 'w')
    try:
        with output:
            for line in text.split('\n'):
                # A file ending in a newline yields a final empty piece;
                # blank lines carry no record, so skip them instead of
                # failing on the [1] index below.
                if not line.strip():
                    continue
                # Segment between the leading "<digits>: " marker and the
                # first long punctuation-free run.
                names = re.split(r'\d+: |[^,;.]{20,}', line)[1]
                authors = re.findall(r'\w+,? [\w.]+', names)
                # Put a tab in front of the long run so the line can be cut
                # there; the second piece is what follows the author list.
                tabbed = re.sub(r'([^,;.]{20,}.*)', r'\t\1', line) + '\n'
                desc = tabbed.split('\t')
                row = ''.join(name + '\t' for name in authors) + desc[1]
                output.write(row)
                buildMatrix(matrix, row, rows, columns)
        print("Wrote file: %s" % OUTPUT)
    except Exception as err:
        # Top-level boundary of the script: report the problem and stop.
        print(err)
def buildMatrix(matrix, row, rows, columns):
    """Print a two-item list made from ``columns`` and the last field of ``row``.

    ``row`` is split on tab characters and its final field is inserted into
    a list that initially holds only ``columns``.  Because the insertion
    index is ``columns`` itself, the field lands after the count whenever
    ``columns >= 1`` and before it when ``columns`` is 0 or negative.

    ``matrix`` and ``rows`` are accepted but not used here.
    NOTE(review): this looks like an unfinished step toward filling
    ``matrix`` -- confirm the intended behaviour before extending it.

    Returns None; the only effect is the line printed to stdout.
    """
    fields = row.split('\t')
    arr = [columns]
    arr.insert(columns, fields[-1])
    # Single-argument call form prints identically on Python 2 and 3.
    print(arr)
def rowcount(filename):
    """Return the number of lines in the file at ``filename``.

    The file is iterated line by line, so a final line without a trailing
    newline still counts and an empty file yields 0.  The handle is closed
    before returning.
    """
    with open(filename) as handle:
        return sum(1 for _ in handle)
def authorcount(filename):
    """Return the largest per-line author count in ``filename``, plus one.

    For each line, the text between the leading ``<digits>: `` marker and
    the first run of 20 or more characters free of ``,``, ``;`` and ``.``
    is scanned for author names (``\\w+,? [\\w.]+``).  The result is the
    maximum number of names found on any single line, plus one
    (presumably one extra column for the text after the authors --
    confirm with the caller).

    Lines that contain neither the marker nor a long run -- blank lines in
    particular, such as the empty piece after a trailing newline -- have no
    author segment and are skipped.  A file with no usable lines yields 1.
    """
    with open(filename) as handle:
        text = handle.read()

    most = 0
    for line in text.split('\n'):
        parts = re.split(r'\d+: |[^,;.]{20,}', line)
        if len(parts) < 2:
            # Nothing matched, so there is no author segment to inspect.
            continue
        found = re.findall(r'\w+,? [\w.]+', parts[1])
        most = max(most, len(found))
    return most + 1