Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import sys
- from sys import argv
- import collections
- import os
- import re
- import subprocess
- import pp
# Command-line contract: <script> nproc filename1 filename2 filename3
#   nproc     - worker count (the job-server setup below re-reads it from
#               sys.argv[1] and converts it with int())
#   filename1 - input file that master() counts and partitions
#   filename2 - accepted but not referenced anywhere else in this script
#   filename3 - "key value" mapping file handed to headerreplace()
# Check the count up front so a wrong invocation yields a usage message
# instead of a bare tuple-unpacking ValueError.
if len(argv) != 5:
    sys.exit("usage: %s nproc filename1 filename2 filename3"
             % (argv[0] if argv else "script"))
script, nproc, filename1, filename2, filename3 = argv
def count_lines(file):
    """Count the lines of a text file and report the total on stdout.

    The file is streamed in-process rather than parsed from ``wc -l``
    output: the column layout of ``wc`` differs between platforms (padded
    on BSD/macOS, unpadded on GNU), so picking a fixed token index out of
    its output is not reliable, and a ``wc`` error message would otherwise
    be fed to ``int()``.

    Args:
        file: Path of the file to count.

    Returns:
        int: Number of lines in the file. A final line without a trailing
        newline is counted, so the result equals ``len(f.readlines())``.

    Raises:
        IOError / OSError: If the file cannot be opened.
    """
    with open(file) as handle:
        # Iterating the handle yields one item per line without loading
        # the whole file into memory.
        line_count = sum(1 for _ in handle)
    print("%d lines found." % line_count)
    return line_count
def splitFile(file, line_count, num_partitions=4):
    """Read ``file`` and split its lines into contiguous partitions.

    Every line of the file ends up in exactly one partition. The first
    ``num_partitions - 1`` partitions each hold
    ``line_count // num_partitions`` lines; the last partition takes all
    remaining lines, so no line is lost when the count is not evenly
    divisible or when ``line_count`` underestimates the file.

    Args:
        file: Path of the text file to split.
        line_count: Number of lines used to size the partitions (normally
            the value returned by ``count_lines``).
        num_partitions: How many partitions to produce. Defaults to 4.

    Returns:
        tuple: ``num_partitions`` lists of lines (newlines preserved), in
        file order.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
        IOError / OSError: If the file cannot be opened.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")
    print("Partitioning files...")
    with open(file) as f:
        lines = f.readlines()
    # Floor division keeps the size an int on both Python 2 and Python 3.
    partition = line_count // num_partitions
    # Consecutive slices share their boundary index, so the line sitting at
    # each boundary is kept rather than skipped.
    leading = tuple(
        lines[index * partition:(index + 1) * partition]
        for index in range(num_partitions - 1)
    )
    # The last partition runs to the end of the file to pick up any remainder.
    partitions = leading + (lines[(num_partitions - 1) * partition:],)
    print("%d partitions created." % len(partitions))
    return partitions
def headerreplace(partition, file2, file3):
    """Write ``partition`` to ``file2``, substituting keys listed in ``file3``.

    ``file3`` is read as one ``key value`` pair per line, split on the first
    space. Each line of ``partition`` is written to ``file2`` exactly once:
    if it contains one of the keys, every occurrence of the first matching
    key (in dictionary iteration order) is replaced by that key's value;
    otherwise the line is written unchanged.

    NOTE(review): master() submits this function to the pp job server
    without depfuncs/modules, so it is deliberately kept free of references
    to other module-level names - confirm against the pp documentation
    before extracting helpers.

    Args:
        partition: Iterable of lines (with their newlines) to process.
        file2: Path of the output file; it is created or truncated.
        file3: Path of the ``key value`` mapping file.

    Returns:
        str: The literal ``"Finished job"`` once all lines are written.

    Raises:
        ValueError: If a non-blank mapping line has no key or no space
            separating key and value.
        IOError / OSError: If either file cannot be opened.
    """
    # initialize dictionary
    d = {}
    with open(file2, 'w') as corrected, open(file3) as f:
        # create dictionary
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no mapping.
                continue
            key, sep, val = line.partition(" ")
            if not sep or not key:
                # An empty key would match (and mangle) every line.
                raise ValueError(
                    "expected 'key value' in %s, got %r" % (file3, line))
            d[key] = val
        # parse original and print replacement to corrected
        for line in partition:
            for key in d:
                if key in line:
                    line = line.replace(key, d[key])
                    # Only the first matching key is applied to a line.
                    break
            # Written unconditionally so unmatched lines are never dropped
            # and matched lines are never emitted twice.
            corrected.write(line)
    return "Finished job"
# Remote parallel-python servers to connect to; empty, so only local
# workers are used.
ppservers = ()
if len(sys.argv) <= 1:
    # No worker count on the command line: let pp pick the number of
    # workers itself.
    job_server = pp.Server(ppservers=ppservers)
else:
    # First command-line argument is the requested number of workers.
    ncpus = int(sys.argv[1])
    job_server = pp.Server(ncpus, ppservers=ppservers)
print("Starting pp with %s workers" % job_server.get_ncpus())
def master():
    """Partition ``filename1`` and run ``headerreplace`` on each part via pp.

    Counts the lines of ``filename1``, splits it with ``splitFile``, and
    submits one ``headerreplace`` job per partition to ``job_server``, each
    writing to its own ``correctedN.txt`` file and reading the mapping from
    ``filename3``. Prints "Completed job" for every job whose result is
    truthy.
    """
    total_lines = count_lines(filename1)
    chunks = splitFile(filename1, total_lines)
    out_names = ("corrected1.txt", "corrected2.txt", "corrected3.txt", "corrected4.txt")
    # Submit everything first so the jobs can run concurrently; results are
    # collected afterwards.
    pending = [
        job_server.submit(headerreplace, (chunk, out_name, filename3))
        for chunk, out_name in zip(chunks, out_names)
    ]
    for job in pending:
        # Calling the job object waits for it and yields the function's
        # return value.
        if job():
            print("Completed job")


master()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement