Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import re
def gen_indexes(txt):
    """Return the positions of blank lines together with their neighbours.

    For every element of ``txt`` that equals the empty string, three
    positions are emitted in order: the one just before it, its own, and
    the one just after it.

    The positions are neither de-duplicated nor range-checked, so the
    result can contain repeats as well as ``-1`` (blank first line) or
    ``len(txt)`` (blank last line).
    """
    return [
        neighbour
        for position, entry in enumerate(txt)
        if entry == ''
        for neighbour in (position - 1, position, position + 1)
    ]
def gen_clean_txt(txt):
    """Return ``txt`` without its blank lines and the lines next to them.

    Args:
        txt: Sequence of text lines.

    Returns:
        A new list holding every line of ``txt`` whose position is not
        reported by ``gen_indexes`` (i.e. not a blank line and not
        directly before or after one). Order is preserved.
    """
    # gen_indexes returns a list that grows by three entries per blank
    # line; turning it into a set once makes each membership test O(1)
    # instead of a linear scan per input line.
    dropped = set(gen_indexes(txt))
    return [line for index, line in enumerate(txt) if index not in dropped]
def gen_list_airports(txt):
    """Split ``txt`` into one block of lines per airport entry.

    A line starts an airport entry when it ends with a coordinate pair
    (``D D D[NS]/D D D[WE]``) and a two-letter code followed by a
    four-letter code (the "UF" and airport symbol, per the original
    comments) appears either on that same line or on the line right
    after it.

    Args:
        txt: List of text lines.

    Returns:
        A list of blocks, each a list of lines running from one airport
        title line up to (not including) the next one. The last block
        runs to the end of ``txt``. Lines before the first title line are
        not part of any block. Returns an empty list when no title line
        is found.
    """
    # Compiled once here rather than looked up on every loop iteration.
    coords_re = re.compile(r'.*\d+ \d+ \d+[NS]\/\d+ \d+ \d+[WE]$')
    uf_re = re.compile(r'.*[A-Z]{2} [A-Z]{4}.*')

    total = len(txt)
    airports = []
    for index, line in enumerate(txt):
        if coords_re.match(line) is None:
            continue
        if uf_re.match(line) is not None:
            # Coordinates and UF/airport code share the title line.
            airports.append(index)
        elif index + 1 < total and uf_re.match(txt[index + 1]) is not None:
            # The title wrapped: the code sits on the following line.
            # The bounds check keeps a coordinate line at the very end
            # of the text from raising IndexError.
            airports.append(index)

    # Closing the boundary list with len(txt) makes sure the final
    # airport gets a block too instead of being silently dropped.
    boundaries = airports + [total]
    return [txt[start:stop] for start, stop in zip(boundaries, boundaries[1:])]
# Read the pdfgrep dump line by line, dropping newline characters from
# both ends of every line.
with open('ch3-pdfgrep.txt', 'r') as f:
    txt = [raw.strip('\n') for raw in f]

# Discard blank lines (and their neighbours), then group what is left
# into per-airport blocks.
txt = gen_clean_txt(txt)
airports = gen_list_airports(txt)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement