Advertisement
Guest User

Untitled

a guest
Jan 17th, 2017
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 15.10 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Thu Jun 23 22:14:36 2016
  4.  
  5. @author: Michael
  6. """
  7.  
  8. #QUIZ: ITERATIVE PARSING
  9.  
  10. import xml.etree.cElementTree as ET
  11. from pprint import pprint
  12.  
  13. tag_names = {}
  14.  
  15. def count_tags(filename):
  16. for event, element in ET.iterparse(filename):
  17. if element.tag not in tag_names:
  18. tag_names[element.tag] = 0
  19. if element.tag in tag_names:
  20. tag_names[element.tag] = (tag_names[element.tag]+1)
  21. print tag_names
  22.  
  23. def test():
  24.  
  25. tags = count_tags('example.osm')
  26. pprint(tags)
  27. assert tags == {'bounds': 1,
  28. 'member': 3,
  29. 'nd': 4,
  30. 'node': 20,
  31. 'osm': 1,
  32. 'relation': 1,
  33. 'tag': 7,
  34. 'way': 1}
  35.  
  36. if __name__ == "__main__":
  37. test()
  38.  
  39.  
  40. #QUIZ: EXPLORING USERS
  41.  
  42. #!/usr/bin/env python
  43. # -*- coding: utf-8 -*-
  44. import xml.etree.cElementTree as ET
  45. from pprint import pprint
  46. import re
  47.  
  48. def process_map(filename):
  49. users = []
  50. for event, element in ET.iterparse(filename):
  51. for tag in element.iter("node"):
  52. if tag.attrib['uid'] not in users:
  53. users.append(element.attrib['uid'])
  54. return users
  55.  
  56. def test():
  57.  
  58. users = process_map('example.osm')
  59. pprint(users)
  60. print len(users)
  61. assert len(users) == 6
  62.  
  63.  
  64.  
  65. if __name__ == "__main__":
  66. test()
  67.  
  68.  
  69. #QUIZ: IMPROVING STREET NAMES
  70.  
  71. import xml.etree.cElementTree as ET
  72. from collections import defaultdict
  73. import re
  74. from pprint import pprint
  75.  
  76. OSMFILE = "example.osm"
  77.  
  78. #regular expression that captures the last word of a street name (its street type), including an optional trailing period
  79. street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
  80.  
  81. #All the expected street types (the correct ones)
  82. expected = ["Alley", "Crescent", "Loop", "Street", "Avenue", "Plaza", "Boulevard", "Drive",
  83. "Court", "Terrace", "Walk", "Place", "Square", "Lane", "Road", "Parkway", "Highway",
  84. "Trail", "Commons", "Americas", "Center", "Circle", "Close", "East", "Expressway",
  85. "Extension", "Gardens", "Heights", "Island", "North", "West", "South", "Park", "Path", "Promenade",
  86. "Slip", "Row", "Rockaways", "Southwest", "Turnpike", "X", "Y", "Z"]
  87.  
  88. # All the errors, or non-uniform street types I caught
  89. mapping = { "St": "Street",
  90. "St.": "Street",
  91. "Ave": "Avenue",
  92. "Ave.": "Avenue",
  93. "ave": "Avenue",
  94. "Pkwy": "Parkway",
  95. "Blvd": "Boulevard",
  96. "Pl": "Place",
  97. "Hwy": "Highway",
  98. "Dr": "Drive",
  99. "Rd": "Road",
  100. "Avenue,": "Avenue",
  101. "Plz": "Place",
  102. "ST": "Street",
  103. "Streeet":"Street",
  104. "avenue":"Avenue",
  105. "Steet": "Street"
  106. }
  107.  
  108.  
  109. #checks if a street_type is in the expected list, if not add it to its own group
  110. def audit_street_type(street_types, street_name):
  111. m = street_type_re.search(street_name)
  112. if m:
  113. street_type = m.group()
  114. if street_type not in expected:
  115. street_types[street_type].add(street_name)
  116.  
  117. #checks if the element is a street name
  118. def is_street_name(elem):
  119. return (elem.attrib['k'] == "addr:street")
  120.  
  121.  
  122. #makes street_types a default dictionary of sets of elements
  123. #parses through the osm file and if a tag in a node or way is found, it checks if that tag is a street, then checks if its in the expected list
  124. def audit(osmfile):
  125. osm_file = open(osmfile, "r")
  126. street_types = defaultdict(set)
  127. for event, elem in ET.iterparse(osm_file, events=("start",)):
  128.  
  129. if elem.tag == "node" or elem.tag == "way":
  130. for tag in elem.iter("tag"):
  131. if is_street_name(tag):
  132. audit_street_type(street_types, tag.attrib['v'])
  133. osm_file.close()
  134. return street_types
  135.  
  136. #takes in each name, and knows the appropriate mappings if it is not a correct name
  137. #if the name is a regular street type, put it in a group, check if it needs to be updated, change its name if necessary
  138. #if its a street type but its not one that needs changing, put in a list of other street types
  139. def update_name(name, mapping):
  140.  
  141. m = street_type_re.search(name)
  142. other_street_types = []
  143. if m:
  144. street_type = m.group()
  145. if street_type in mapping.keys():
  146. name = re.sub(street_type, mapping[street_type], name)
  147. else:
  148. other_street_types.append(street_type)
  149.  
  150. return name
  151.  
  152. #first, print the dictionary of street types, in expected and now
  153. #then, check all the street names to check which need updating. For those, print the old name, then the new name
  154. def test():
  155. st_types = audit(OSMFILE)
  156. # assert len(st_types) == 3
  157. # pprint(dict(st_types))
  158.  
  159. for st_type, ways in st_types.iteritems():
  160. for name in ways:
  161. better_name = update_name(name, mapping)
  162. print name, "=>", better_name
  163. # if name == "West Lexington St.":
  164. # assert better_name == "West Lexington Street"
  165. # if name == "Baldwin Rd.":
  166. # assert better_name == "Baldwin Road"
  167.  
  168.  
  169. if __name__ == '__main__':
  170. test()
  171.  
  172.  
  173. #QUIZ: PREPARING FOR DATABASE
  174.  
  175.  
  176. import csv
  177. import codecs
  178. import re
  179. import xml.etree.cElementTree as ET
  180. import cerberus
  181. import schema
  182.  
  183. #OSM file of Brooklyn
  184. OSM_PATH = "brooklyn_new-york.osm"
  185.  
  186. #all the csv files that will be created
  187. NODES_PATH = "nodes.csv"
  188. NODE_TAGS_PATH = "nodes_tags.csv"
  189. WAYS_PATH = "ways.csv"
  190. WAY_NODES_PATH = "ways_nodes.csv"
  191. WAY_TAGS_PATH = "ways_tags.csv"
  192.  
  193. #regular expressions, lower colon accounts for a :, problemchars accounts for all problematic characters #$%^
  194. LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
  195. PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
  196.  
  197. #this is the defined schema from Udacity quiz:preparing for database (saved locally)
  198. SCHEMA = schema.schema
  199.  
  200. #all the expected street types
  201. expected = ["Alley", "Crescent", "Loop", "Street", "Avenue", "Plaza", "Boulevard", "Drive",
  202. "Court", "Terrace", "Walk", "Place", "Square", "Lane", "Road", "Parkway", "Highway",
  203. "Trail", "Commons", "Americas", "Center", "Circle", "Close", "East", "Expressway",
  204. "Extension", "Gardens", "Heights", "Island", "North", "West", "South", "Park", "Path", "Promenade",
  205. "Slip", "Row", "Rockaways", "Southwest", "Turnpike", "X", "Y", "Z"]
  206.  
  207. #all the problematic or non-uniform street types I found
  208. mapping = { "St": "Street",
  209. "St.": "Street",
  210. "Ave": "Avenue",
  211. "Ave.": "Avenue",
  212. "ave": "Avenue",
  213. "Pkwy": "Parkway",
  214. "Blvd": "Boulevard",
  215. "Pl": "Place",
  216. "Hwy": "Highway",
  217. "Dr": "Drive",
  218. "Rd": "Road",
  219. "Avenue,": "Avenue",
  220. "Plz": "Place",
  221. "ST": "Street",
  222. "Streeet":"Street",
  223. "avenue":"Avenue",
  224. "Steet": "Street"
  225. }
  226.  
  227. #The column headers of each csv file
  228. # Make sure the fields order in the csv's matches the column order in the sql table schema
  229. NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
  230. NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
  231. WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
  232. WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
  233. WAY_NODES_FIELDS = ['id', 'node_id', 'position']
  234.  
  235.  
  236. #check if k is in the right place using the colon as a reference
  237. def is_k_in_right_place(k):
  238. place=k.find(':')
  239. correct=k[:place]
  240. k=k[place+1:]
  241. return k,correct
  242.  
  243.  
  244. def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
  245. problem_chars=PROBLEMCHARS, default_tag_type='regular'):
  246. """Clean and shape node or way XML element to Python dict"""
  247. node_attribs = {}
  248. way_attribs = {}
  249. way_nodes = []
  250. tags = [] # Handle secondary tags the same way for both node and way elements
  251.  
  252. #First, when top level tag is a node...
  253. if element.tag == 'node':
  254. #then for each attribute in that node that we will put into the csv as a column header..
  255. for i in node_attr_fields:
  256. #set dictionary i to that attribute
  257. node_attribs[i]=element.attrib[i]
  258.  
  259. #Now, when top level tag is a way...
  260. if element.tag=='way':
  261. #for each attribute in that way that will be a column header..
  262. for i in way_attr_fields:
  263. way_attribs[i]=element.attrib[i]
  264.  
  265. #for each node or way tag..
  266. for tag in element.iter("tag"):
  267. # use 'if is_street_name()' function to determine if the attribute matches
  268.  
  269. #make a blank dictionary
  270. dic = {}
  271. #set attributes = all the attributes of that tag
  272. attributes = tag.attrib
  273.  
  274. if is_street_name(tag):
  275. print 'BEFORE'
  276. print tag.attrib['v']
  277. # update tag.attrib['v'] with the return from update_name()
  278. tag.attrib['v']=update_name(tag.attrib['v'],mapping)
  279. print 'AFTER'
  280. print tag.attrib['v']
  281.  
  282. #if the k attribute has some problem characters, continue
  283. if problem_chars.search(tag.attrib['k']):
  284. continue
  285.  
  286. #if the tag is a node
  287. if element.tag=='node':
  288. #set id in the blank dictionary to the id from the node attributes
  289. dic['id']=node_attribs['id']
  290. #but if the tag doesn't = node, set the dictionary id to the id from the way attributes
  291. else:
  292. dic['id']=way_attribs['id']
  293.  
  294. #make the value always equal to the v attribute from the tag element
  295. dic['value'] = attributes['v']
  296.  
  297. #if the tag attribute k has a colon
  298. colon_k=LOWER_COLON.search(tag.attrib['k'])
  299. if colon_k:
  300. #then print the attributes k's group, print that attribute, and make the key and type = k, correct
  301. print colon_k.group(0)
  302. print tag.attrib['k']
  303. dic['key'],dic['type']=is_k_in_right_place(tag.attrib['k'])
  304. #if attribute k does not have a colon, make the key equal to k and the type equal to regular
  305. else:
  306. dic['key']=attributes['k']
  307. dic['type']='regular'
  308.  
  309. #append the dic to the empty tags list
  310. tags.append(dic)
  311.  
  312. #if the tag = way
  313. if element.tag=='way':
  314. position=0
  315. for nd in element.iter("nd"):
  316. way_node_dic={}
  317. way_node_dic['id']=way_attribs['id']
  318. way_node_dic['node_id']=nd.attrib['ref']
  319. way_node_dic['position']=position
  320. position = position + 1
  321. way_nodes.append(way_node_dic)
  322.  
  323. if element.tag == 'node':
  324. return {'node': node_attribs, 'node_tags': tags}
  325. elif element.tag == 'way':
  326. return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
  327.  
  328.  
  329. # ================================================== #
  330. # Helper Functions #
  331. # ================================================== #
  332. def get_element(osm_file, tags=('node', 'way', 'relation')):
  333. """Yield element if it is the right type of tag"""
  334.  
  335. context = ET.iterparse(osm_file, events=('start', 'end'))
  336. _, root = next(context)
  337. for event, elem in context:
  338. if event == 'end' and elem.tag in tags:
  339. yield elem
  340. root.clear()
  341.  
  342.  
  343. def validate_element(element, validator, schema=SCHEMA):
  344. """Raise ValidationError if element does not match schema"""
  345. if validator.validate(element, schema) is not True:
  346. field, errors = next(validator.errors.iteritems())
  347. message_string = "\nElement of type '{0}' has the following errors:\n{1}"
  348. error_strings = (
  349. "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
  350. for k, v in errors.iteritems()
  351. )
  352. raise cerberus.ValidationError(
  353. message_string.format(field, "\n".join(error_strings))
  354. )
  355.  
  356.  
  357. class UnicodeDictWriter(csv.DictWriter, object):
  358. """Extend csv.DictWriter to handle Unicode input"""
  359.  
  360. def writerow(self, row):
  361. super(UnicodeDictWriter, self).writerow({
  362. k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.iteritems()
  363. })
  364.  
  365. def writerows(self, rows):
  366. for row in rows:
  367. self.writerow(row)
  368.  
  369. street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
  370.  
  371.  
  372. def audit_street_type(street_types, street_name):
  373. m = street_type_re.search(street_name)
  374. if m:
  375. street_type = m.group()
  376. if street_type not in expected:
  377. street_types[street_type].add(street_name)
  378.  
  379.  
  380. def is_street_name(elem):
  381. return (elem.attrib['k'] == "addr:street")
  382.  
  383.  
  384. def audit(osmfile):
  385. osm_file = open(osmfile, "r")
  386. street_types = defaultdict(set)
  387. for event, elem in ET.iterparse(osm_file, events=("start",)):
  388.  
  389. if elem.tag == "node" or elem.tag == "way":
  390. for tag in elem.iter("tag"):
  391. if is_street_name(tag):
  392. audit_street_type(street_types, tag.attrib['v'])
  393. osm_file.close()
  394. return street_types
  395.  
  396.  
  397. def update_name(name, mapping):
  398.  
  399. m = street_type_re.search(name)
  400. other_street_types = []
  401. if m:
  402. street_type = m.group()
  403. if street_type in mapping.keys():
  404. name = re.sub(street_type, mapping[street_type], name)
  405. else:
  406. other_street_types.append(street_type)
  407.  
  408. return name
  409.  
  410.  
  411. # ================================================== #
  412. # Main Function #
  413. # ================================================== #
  414. def process_map(file_in, validate):
  415. """Iteratively process each XML element and write to csv(s)"""
  416.  
  417. with codecs.open(NODES_PATH, 'w') as nodes_file, \
  418. codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
  419. codecs.open(WAYS_PATH, 'w') as ways_file, \
  420. codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
  421. codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
  422.  
  423. nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
  424. node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
  425. ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
  426. way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
  427. way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
  428.  
  429. nodes_writer.writeheader()
  430. node_tags_writer.writeheader()
  431. ways_writer.writeheader()
  432. way_nodes_writer.writeheader()
  433. way_tags_writer.writeheader()
  434.  
  435. validator = cerberus.Validator()
  436.  
  437. for element in get_element(file_in, tags=('node', 'way')):
  438. el = shape_element(element)
  439. if el:
  440. if validate is True:
  441. validate_element(el, validator)
  442.  
  443. if element.tag == 'node':
  444. nodes_writer.writerow(el['node'])
  445. node_tags_writer.writerows(el['node_tags'])
  446. elif element.tag == 'way':
  447. ways_writer.writerow(el['way'])
  448. way_nodes_writer.writerows(el['way_nodes'])
  449. way_tags_writer.writerows(el['way_tags'])
  450.  
  451.  
  452. if __name__ == '__main__':
  453. # Note: Validation is ~ 10X slower. For the project consider using a small
  454. # sample of the map when validating.
  455. process_map(OSM_PATH, validate=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement