Advertisement
Guest User

Untitled

a guest
Jan 16th, 2018
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 18.61 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2.  
  3. import copy
  4. import glob
  5. import itertools
  6. import math
  7. import shutil
  8.  
  9. import pandas as pd
  10. import scrapy
  11. from bs4 import BeautifulSoup
  12. from scrapy.http import FormRequest
  13. from scrapy.http import Request
  14.  
  15. from definitions import *
  16. from util.google_maps import get_route
  17. from .helper import *
  18. from .models import *
  19. from .properties import *
  20. import json
  21.  
  22.  
class SingaporeTransitFeedSpider(scrapy.Spider):
    """Scrapy spider that collects Singapore rail data and emits it as GTFS tables.

    The spider starts from the TransitLink rail guide index page and hands
    every item it yields to ``SingaporeGTFSPipeline``.
    """

    # Identifier used to select this spider.
    name = 'singapore-transit-feed'
    # Requests to hosts outside this domain are not part of the crawl.
    allowed_domains = ['transitlink.com.sg']
    # The response of this page is delivered to ``parse``.
    start_urls = ['https://www.transitlink.com.sg/eservice/eguide/rail_idx.php']
    # Per-spider settings: route yielded items through the GTFS pipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'spider.pipelines.SingaporeGTFSPipeline': 300
        }
    }
  32.  
  33. def parse(self, response):
  34. self.logger.info("Start processing GTFS Singapore...")
  35.  
  36. agencies = self.crawl_agencies()
  37. yield {'agencies': agencies}
  38.  
  39. routes = self.crawl_routes()
  40. yield {'routes': routes}
  41.  
  42. calendars = self.crawl_calendars()
  43. yield {'calendars': calendars}
  44.  
  45. trips = self.crawl_trips()
  46. yield {'trips': trips}
  47.  
  48. stops = self.crawl_stops(response)
  49. yield {'stops': stops}
  50.  
  51. shapes = self.caculate_shape(response)
  52. yield {'shapes': shapes}
  53.  
  54. calendar_dates = self.crawl_calendar_dates(calendars)
  55. yield {'calendar_dates': calendar_dates}
  56.  
  57. yield self.crawl_stop_times(routes, stops, trips)
  58.  
  59. def crawl_agencies(self):
  60. agencies = list()
  61. agencies.append(Agency("MRT", "Mass Rapid Transit", "https://mrt.sg/", "Asia/Singapore", "EN"))
  62. agencies.append(Agency("LRT", "Light Rail Transit", "https://mrt.sg/", "Asia/Singapore", "EN"))
  63. return agencies
  64.  
  65. def crawl_routes(self):
  66. routes = list()
  67. for key, value in SINGAPORE_MRT.items():
  68. routes.append(Route('%s_0' % key, 'MRT', key, '%s 0' % value, 1))
  69. routes.append(Route('%s_1' % key, 'MRT', key, '%s 1' % value, 1))
  70.  
  71. for key, value in SINGAPORE_MRT_NO_LOOP.items():
  72. routes.append(Route('%s' % key, 'MRT', key, value, 1))
  73.  
  74.  
  75. for key, value in SINGAPORE_LRT.items():
  76. routes.append(Route(key, 'LRT', key, value, 1))
  77. return routes
  78.  
  79. def crawl_calendars(self):
  80. calendar_list = []
  81. calendar_list += [Calendar(
  82. (CALENDAR_PROPERTIES['mrt_service_prefix'] + CALENDAR_PROPERTIES['lrt_service_prefix'])[index] + '_' +
  83. CALENDAR_PROPERTIES['calendar_suffix'][calendar_index],
  84. 1 if CALENDAR_PROPERTIES['calendar_suffix'][calendar_index] == 'Weekday' else 0,
  85. 1 if CALENDAR_PROPERTIES['calendar_suffix'][calendar_index] == 'Weekday' else 0,
  86. 1 if CALENDAR_PROPERTIES['calendar_suffix'][calendar_index] == 'Weekday' else 0,
  87. 1 if CALENDAR_PROPERTIES['calendar_suffix'][calendar_index] == 'Weekday' else 0,
  88. 1 if CALENDAR_PROPERTIES['calendar_suffix'][calendar_index] == 'Weekday' else 0,
  89. 1 if CALENDAR_PROPERTIES['calendar_suffix'][calendar_index] == 'Saturday' else 0,
  90. 1 if CALENDAR_PROPERTIES['calendar_suffix'][calendar_index] == 'Sunday' else 0,
  91. CALENDAR_PROPERTIES['calendar_start_date'],
  92. CALENDAR_PROPERTIES['calendar_end_date'])
  93. for index in
  94. (range(len(CALENDAR_PROPERTIES['mrt_service_prefix'] + CALENDAR_PROPERTIES['lrt_service_prefix'])))
  95. for calendar_index in (range(len(CALENDAR_PROPERTIES['calendar_suffix'])))]
  96. return calendar_list
  97.  
  98. def crawl_trips(self):
  99. station_defines = StopCodeCrawler(
  100. 'https://www.mytransport.sg/content/mytransport/home/commuting/trainservices.html?myRad=8#Train_Timetable').get_stop_code_from_time_table()
  101. for trip_headsign in MRT_TRIP_HEADSIGN_DEFINES.values():
  102. for station in station_defines:
  103. if (trip_headsign.split(" - ")[0] == station.stop_name):
  104. arr_station_code = station.stop_code
  105. if (trip_headsign.split(" - ")[1] == station.stop_name):
  106. dep_station_code = station.stop_code
  107.  
  108. route_id = list(MRT_TRIP_HEADSIGN_DEFINES.keys())[
  109. list(MRT_TRIP_HEADSIGN_DEFINES.values()).index(trip_headsign)]
  110. timeTableCrawler = TimeTableCrawler(
  111. 'https://www.mytransport.sg/content/mytransport/home/commuting/trainservices/jcr:content/par3/mrt_info.ajax.getTrainInformation.%s.%s.%s.%s.html' %
  112. (arr_station_code,
  113. dep_station_code,
  114. route_id[:2],
  115. trip_headsign.replace(' - ', '.')))
  116. timeTables = timeTableCrawler.crawl(route_id, trip_headsign)
  117. trips = self.trip_process(timeTables + LRT_TIMETABLE_DEFINE)
  118. return trips
  119.  
  120. def crawl_calendar_dates(self, calendars):
  121. calendar_dates = list()
  122. for calendar in calendars:
  123. for day in SINGAPORE_HOLIDAYS:
  124. calendar_dates.append(CalendarDate(calendar.service_id, day, 1))
  125. return calendar_dates
  126.  
  127. def crawl_stops(self, response):
  128. stations = response.css('select[name="mrtcode_start"]').extract()[0]
  129. soup = BeautifulSoup(stations, 'html')
  130. L = list()
  131. coord = pd.read_csv(SINGAPORE_MRT_INPUT + "/sg_mrt.csv", encoding='utf-8',
  132. dtype={'stn_no': str, 'stn_line': str, 'lat': str, 'lon': str}, sep=';').fillna('')
  133. for i, row_coord in coord.iterrows():
  134. for option in soup.find_all('option')[1:]:
  135. code = row_coord['stn_line'] + row_coord['stn_no']
  136. if code in option.text.rsplit(' ', 1)[1][1:-1].split('/'):
  137. L.append(Stop(stop_id=option.text.rsplit(' ', 1)[1][1:-1], stop_name=option.text.rsplit(' ', 1)[0],
  138. stop_lat=row_coord['lat'], stop_lon=row_coord['lon'],
  139. stop_desc=option.text))
  140. break
  141. df = pd.DataFrame().from_records([obj.__dict__ for obj in L]).groupby(['stop_name']).first().reset_index()
  142. stops = []
  143. for i, row in df.iterrows():
  144. stops.append(Stop(stop_id=row['stop_id'], stop_name=row['stop_name'],
  145. stop_lat=row['stop_lat'], stop_lon=row['stop_lon'], stop_desc=row['stop_desc']))
  146. return stops
  147.  
  148. def caculate_shape(self, response):
  149. coord = pd.read_csv(SINGAPORE_MRT_INPUT + "/sg_mrt.csv", encoding='utf-8', dtype={'stn_no': str, 'stn_line':
  150. str, 'lat': str, 'lon': str}, sep=';').fillna('')
  151. k = 0
  152.  
  153. shapes = list()
  154. # Caculate lrt
  155. for i, row_coord in coord.iterrows():
  156. if (i == len(coord) - 1) or not (row_coord['stn_line'] in ["PW","PE","SW","SE"]): break
  157. if coord.iloc[i]['stn_line'] == coord.iloc[i + 1]['stn_line']:
  158. if ('PW' == coord.iloc[i]['stn_line'] or 'PE' == coord.iloc[i]['stn_line']) and row_coord[
  159. 'stn_no'] == '1':
  160. for m, coord_case_spc1 in enumerate(
  161. get_route(coord.loc[coord['stn_line'] == 'PTC']['lat'].values[0] + ',' +
  162. coord.loc[coord['stn_line'] == 'PTC']['lon'].values[0],
  163. coord.iloc[i]['lat'] + ',' + coord.iloc[i]['lon'],
  164. mode='transit')):
  165. k = k + 1
  166. shapes.append(
  167. Shape(shape_id=row_coord['stn_line'],
  168. shape_pt_lat=coord_case_spc1[0],
  169. shape_pt_lon=coord_case_spc1[1],
  170. shape_pt_sequence=k))
  171. if ('SW' == coord.iloc[i]['stn_line'] or 'SE' == coord.iloc[i]['stn_line']) and row_coord[
  172. 'stn_no'] == '1':
  173. for m, coord_case_spc1 in enumerate(
  174. get_route(coord.loc[coord['stn_line'] == 'STC']['lat'].values[0] + ',' +
  175. coord.loc[coord['stn_line'] == 'STC']['lon'].values[0],
  176. coord.iloc[i]['lat'] + ',' + coord.iloc[i]['lon'],
  177. mode='transit')):
  178. k = k + 1
  179. shapes.append(
  180. Shape(shape_id=row_coord['stn_line'],
  181. shape_pt_lat=coord_case_spc1[0],
  182. shape_pt_lon=coord_case_spc1[1],
  183. shape_pt_sequence=k))
  184.  
  185. for l, coord_sp in enumerate(
  186. get_route(coord.iloc[i]['lat'] + ',' + coord.iloc[i]['lon'],
  187. coord.iloc[i + 1]['lat'] + ',' + coord.iloc[i + 1]['lon'],
  188. mode='transit')):
  189. k = k + 1
  190. shapes.append(
  191. Shape(shape_id=row_coord['stn_line'],
  192. shape_pt_lat=coord_sp[0],
  193. shape_pt_lon=coord_sp[1],
  194. shape_pt_sequence=k))
  195.  
  196. else:
  197. if 'PW' == coord.iloc[i]['stn_line'] or 'PE' == coord.iloc[i]['stn_line']:
  198. for m, coord_case_spc1 in enumerate(
  199. get_route(coord.iloc[i]['lat'] + ',' + coord.iloc[i]['lon'],
  200. coord.loc[coord['stn_line'] == 'PTC']['lat'].values[0] + ',' +
  201. coord.loc[coord['stn_line'] == 'PTC']['lon'].values[0],
  202. mode='transit')):
  203. k = k + 1
  204. shapes.append(
  205. Shape(shape_id=row_coord['stn_line'],
  206. shape_pt_lat=coord_case_spc1[0],
  207. shape_pt_lon=coord_case_spc1[1],
  208. shape_pt_sequence=k))
  209. if 'SW' == coord.iloc[i]['stn_line'] or 'SE' == coord.iloc[i]['stn_line']:
  210. for m, coord_case_spc1 in enumerate(
  211. get_route(
  212. coord.iloc[i]['lat'] + ',' + coord.iloc[i]['lon'],
  213. coord.loc[coord['stn_line'] == 'STC']['lat'].values[0] + ',' +
  214. coord.loc[coord['stn_line'] == 'STC']['lon'].values[0],
  215. mode='transit')):
  216. k = k + 1
  217. shapes.append(
  218. Shape(shape_id=row_coord['stn_line'],
  219. shape_pt_lat=coord_case_spc1[0],
  220. shape_pt_lon=coord_case_spc1[1],
  221. shape_pt_sequence=k))
  222. k = 0
  223.  
  224. # Caculate mrt
  225. data = json.load(open(SINGAPORE_MRT_INPUT + "/mrt.json", encoding="utf8"))
  226. old_id = None
  227. for search in data:
  228. if old_id == None or old_id != search['name']: k = 1
  229. prefix = " " + search['type'] + ", Singapore"
  230. for coord in get_route(search['from'] + prefix, search['to'] + prefix, mode='transit'):
  231. shapes.append(
  232. Shape(shape_id=search['name'], shape_pt_lat=coord[0], shape_pt_lon=coord[1], shape_pt_sequence=k))
  233. k += 1
  234. old_id = search['name']
  235.  
  236. return shapes
  237.  
  238. def get_stop_value(self, response):
  239.  
  240. routes = response.meta['routes']
  241. stops = response.meta['stops']
  242. trips = response.meta['trips']
  243.  
  244. num_route_stops = sum(len(value) for key, value in LRT_LIST_STOPS.items()) + sum(value for key, value in MRT_NUM_STOPS.items()) *2
  245.  
  246. stop_value_dict = {}
  247. scraped_data = {'trips': trips, 'num_route_stops': num_route_stops, 'route_stop_dict': {}}
  248.  
  249. stop_time_data = {'stop_times': scraped_data}
  250.  
  251. all_stop_with_value = response.xpath(
  252. '//*[@id="Content-eservice"]/article/section/article/section/form/dl[1]/dd/select/option[position()>1]')
  253. for stop_value in all_stop_with_value:
  254. stop_value_dict[stop_value.xpath('.//text()').extract_first()] = stop_value.xpath(
  255. './/@value').extract_first()
  256.  
  257. def pairwise(iterable):
  258. a, b = itertools.tee(iterable)
  259. next(b, None)
  260. return zip(a, b)
  261.  
  262. def get_sort_key(stop_id, route_id):
  263. primary_stop_id = [stop_id for stop_id in stop_id.split("/") if route_id.split("_")[0] in stop_id][0]
  264.  
  265. return int(primary_stop_id.replace(route_id.split("_")[0], ''))
  266.  
  267. for route in routes:
  268. trips_stop = list()
  269. if route.route_id in list(LRT_LIST_STOPS.keys()):
  270. for stop_id in LRT_LIST_STOPS[route.route_id]:
  271. stop = [stop for stop in stops if stop.stop_id == stop_id][0]
  272. trips_stop.append(stop)
  273. else:
  274. trips_stop = [stop for stop in stops if route.route_id.split("_")[0] in stop.stop_id]
  275. trips_stop = sorted(trips_stop, key=lambda x: get_sort_key(x.stop_id, route.route_id), reverse=False)
  276.  
  277. scraped_data['route_stop_dict'][route.route_id] = list()
  278. first_stop = copy.deepcopy(trips_stop[0])
  279. scraped_data['route_stop_dict'][route.route_id].append(first_stop)
  280.  
  281. for x, y in pairwise(trips_stop):
  282. start_station_code = stop_value_dict[x.stop_desc]
  283. end_station_code = stop_value_dict[y.stop_desc]
  284. yield FormRequest("https://www.transitlink.com.sg/eservice/eguide/rail_info.php",
  285. formdata={
  286. 'mrtcode_start': start_station_code,
  287. 'mrtcode_end': end_station_code
  288. },
  289. callback=self.create_stop_times,
  290. dont_filter=True,
  291. meta={'end_stop': y, 'route_id': route.route_id, 'stop_time_data': stop_time_data}
  292. )
  293.  
  294. def create_stop_times(self, response):
  295. travel_time = response.xpath(
  296. '//*[@id="Content-eservice"]/article/section/table[2]/tr[2]/td[4]/text()').extract_first()
  297.  
  298. stop_time_data = response.meta['stop_time_data']
  299. route_id = response.meta['route_id']
  300. route_stops = stop_time_data['stop_times']['route_stop_dict'][route_id]
  301.  
  302. end_stop = response.meta['end_stop']
  303. end_stop.time_calculate = travel_time
  304.  
  305. route_stops.append(end_stop)
  306.  
  307. return stop_time_data
  308.  
  309. def crawl_stop_times(self, routes, stops, trips):
  310. return Request("https://www.transitlink.com.sg/eservice/eguide/rail_idx.php", callback=self.get_stop_value,
  311. meta={'routes': routes, 'stops': stops, 'trips': trips})
  312.  
  313.  
  314. def time_to_second(self, time):
  315. return float(time[:2]) * 3600 + float(time[2:]) * 60 if (float(time[:2]) > 2) else (float(
  316. time[:2]) + 24) * 3600 + float(time[2:]) * 60
  317.  
  318. def second_to_time(self, sc):
  319. hours = sc // (60 * 60)
  320. sc %= (60 * 60)
  321. minutes = sc // 60
  322. sc %= 60
  323. return "%02i:%02i:%02i" % (hours, minutes, sc)
  324.  
  325. def trip_start_time_generate(self, start_time, end_time, peak_period, peak, off_peak):
  326. start_time_list = []
  327. et = self.time_to_second(end_time)
  328. cr_time = self.time_to_second(start_time)
  329. if (peak_period is not None and peak is not None):
  330. for i in range(0, len(peak_period)):
  331. if i & 1:
  332. start_time_list += [cr_time + peak * 60 * step for step in
  333. range(math.ceil((self.time_to_second(peak_period[i]) - cr_time) / (peak * 60)))]
  334. cr_time = start_time_list[-1] + peak * 60
  335. else:
  336. start_time_list += [cr_time + off_peak * 60 * step for step in range(
  337. math.ceil((self.time_to_second(peak_period[i]) - cr_time) / (off_peak * 60)))]
  338. cr_time = start_time_list[-1] + off_peak * 60
  339. start_time_list += [cr_time + off_peak * 60 * step for step in
  340. range(math.ceil((et - cr_time) / (off_peak * 60)))]
  341. return start_time_list
  342.  
  343. def trip_process(self, timeTables):
  344. trip_list = []
  345. for tita in timeTables:
  346. wd_start_time_list = self.trip_start_time_generate(tita.wd_start_time, tita.wd_end_time,
  347. tita.wd_peak_period, tita.wd_peak, tita.wd_off_peak)
  348. for wd_index in range(len(wd_start_time_list)):
  349. trip_list.append(Trip(tita.route_id, tita.route_id + '_Weekday',
  350. tita.route_id + '_' + str(wd_index) + '_Weekday', tita.trip_headsign,
  351. tita.route_id[:2], tita.route_id[:2],
  352. self.second_to_time(wd_start_time_list[wd_index])))
  353.  
  354. sat_start_time_list = self.trip_start_time_generate(tita.sat_start_time, tita.sat_end_time,
  355. tita.sat_peak_period, tita.sat_peak, tita.sat_off_peak)
  356. for sat_index in range(len(sat_start_time_list)):
  357. trip_list.append(Trip(tita.route_id, tita.route_id + '_Saturday',
  358. tita.route_id + '_' + str(sat_index) + '_Saturday', tita.trip_headsign,
  359. tita.route_id[:2], tita.route_id[:2],
  360. self.second_to_time(sat_start_time_list[sat_index])))
  361.  
  362. sun_start_time_list = self.trip_start_time_generate(tita.sun_start_time, tita.sun_end_time, None, None,
  363. tita.sun_off_peak)
  364. for sun_index in range(len(sun_start_time_list)):
  365. trip_list.append(
  366. Trip(tita.route_id, tita.route_id + '_Sunday', tita.route_id + '_' + str(sun_index) + '_Sunday',
  367. tita.trip_headsign, tita.route_id[:2], tita.route_id[:2],
  368. self.second_to_time(sun_start_time_list[sun_index])))
  369. return trip_list
  370.  
  371. def close(self, reason):
  372. self.logger.info("Finish processing GTFS Singapore...")
  373. shutil.make_archive(SINGAPORE_MRT_OUTPUT, 'zip', SINGAPORE_MRT_OUTPUT)
  374. for f in glob.glob(SINGAPORE_MRT_OUTPUT + "/*.txt"):
  375. os.remove(f)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement