Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def main():
    """Convert each line of INPUT into a tab-separated row written to OUTPUT.

    For every non-blank input line the author names are extracted, each one
    followed by a tab, and the remainder of the line (starting at the first
    run of 20 or more characters containing no ',', ';' or '.') is appended.
    The resulting row is written to OUTPUT and handed to ``buildMatrix``.

    Progress is reported on stdout.  Any exception raised while converting
    lines is printed and stops the run; the success message is only printed
    when every line was processed.
    """
    # NOTE(review): the pasted source lost its indentation; the extent of the
    # per-line loop below was reconstructed from the statement order (the
    # per-line resets directly after buildMatrix) -- confirm against the
    # original script.

    # Read the whole input up front; the context manager releases the handle
    # instead of leaving it to the garbage collector.
    with open(INPUT) as source:
        text = source.read()
    print("Processing: %s" % (INPUT))

    # The output file is opened before the counting passes (as before), and
    # the `with` block guarantees it is closed even when a line fails.
    with open(OUTPUT, 'w') as output:
        rows = rowcount(INPUT)
        print("Total rows in file: %s" % (rows))
        columns = authorcount(INPUT)
        print("Max author count: %s" % (columns))
        matrix = Matrix(columns, rows)

        try:
            for line in text.split('\n'):
                # split('\n') yields an empty string after a trailing
                # newline; such lines carry no record and would otherwise
                # abort the run with an IndexError.
                if not line.strip():
                    continue

                # Second split field: the text between the leading "<n>: "
                # marker and the long description run.
                names = re.split(r'\d+: |[^,;.]{20,}', line)[1]
                author = re.findall(r'\w+,? [\w.]+', names)

                # Put a tab in front of the description so it can be cut out.
                result = re.sub(r'([^,;.]{20,}.*)', r'\t\1', line) + '\n'
                desc = result.split('\t')

                # One tab-terminated cell per author, then the description.
                row = ''.join(a + '\t' for a in author) + desc[1]
                output.write(row)
                buildMatrix(matrix, row, rows, columns)
        except Exception as err:
            # Top-level boundary: report the failure and stop, as before.
            print(err)
            return

    print("Wrote file: %s" % (OUTPUT))
def buildMatrix(matrix, row, rows, columns):
    """Print a two-element list built from *columns* and the last field of *row*.

    *row* is split on tab characters and its final field is inserted into a
    list that initially holds only *columns*, at index *columns*.  The list
    is printed; nothing is returned.  *matrix* and *rows* are accepted but
    not used here.
    """
    # NOTE(review): seeding the list with the single value `columns` (rather
    # than `columns` empty cells) looks unfinished -- confirm the intent.
    last_field = row.rsplit('\t', 1)[-1]
    cells = [columns]
    cells.insert(columns, last_field)
    print(cells)
def rowcount(filename):
    """Return the number of lines in the file at *filename*.

    An empty file yields 0.  A final line without a trailing newline still
    counts as one line.
    """
    # The context manager closes the handle as soon as counting is done; the
    # previous version opened the file inline and never closed it.
    with open(filename) as handle:
        return sum(1 for _ in handle)
def authorcount(filename):
    """Return the largest number of author names on any line, plus one.

    Each line is split on a leading "<digits>: " marker or on a run of 20 or
    more characters containing no ',', ';' or '.'; author names are then
    searched for in the second field of that split.  Lines on which nothing
    splits (for example the empty string left by a trailing newline) are
    skipped instead of raising IndexError.  If no line yields any author the
    result is 1.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(filename) as handle:
        text = handle.read()

    counts = []
    for line in text.split('\n'):
        parts = re.split(r'\d+: |[^,;.]{20,}', line)
        if len(parts) < 2:
            # No delimiter matched, so there is no author field to inspect.
            continue
        counts.append(len(re.findall(r'\w+,? [\w.]+', parts[1])))

    # `or [0]` keeps max() from raising on a file with no usable lines.
    return max(counts or [0]) + 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement