Advertisement
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
- # -*- coding: utf-8 -*-
- import copy
- import glob
- import itertools
- import math
- import shutil
- import pandas as pd
- import scrapy
- from bs4 import BeautifulSoup
- from scrapy.http import FormRequest
- from scrapy.http import Request
- from definitions import *
- from util.google_maps import get_route
- from .helper import *
- from .models import *
- from .properties import *
- import json
class SingaporeTransitFeedSpider(scrapy.Spider):
    """Scrapy spider that collects the data for a Singapore rail (MRT/LRT) GTFS feed."""

    # Name under which Scrapy registers this spider.
    name = 'singapore-transit-feed'
    # Only requests to the TransitLink site are followed.
    allowed_domains = ['transitlink.com.sg']
    # The rail e-guide index page; its response is what ``parse`` receives.
    start_urls = ['https://www.transitlink.com.sg/eservice/eguide/rail_idx.php']
    # Every item yielded by this spider goes through the Singapore GTFS pipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'spider.pipelines.SingaporeGTFSPipeline': 300
        }
    }
def parse(self, response):
    """Drive the crawl from the rail e-guide index page.

    Yields one single-key dict per table, in this order: agencies, routes,
    calendars, trips, stops, shapes, calendar_dates. Each table is built
    right before it is yielded. The last item yielded is the request
    returned by ``crawl_stop_times``.

    :param response: response of the start URL; forwarded to the stop and
        shape builders.
    """
    self.logger.info("Start processing GTFS Singapore...")

    yield {'agencies': self.crawl_agencies()}

    route_list = self.crawl_routes()
    yield {'routes': route_list}

    service_calendars = self.crawl_calendars()
    yield {'calendars': service_calendars}

    trip_list = self.crawl_trips()
    yield {'trips': trip_list}

    stop_list = self.crawl_stops(response)
    yield {'stops': stop_list}

    yield {'shapes': self.caculate_shape(response)}

    yield {'calendar_dates': self.crawl_calendar_dates(service_calendars)}

    # Routes, stops and trips are needed again to compute the stop times.
    yield self.crawl_stop_times(route_list, stop_list, trip_list)
def crawl_agencies(self):
    """Return the two hard-coded agencies of the feed: MRT and LRT."""
    agency_specs = (
        ("MRT", "Mass Rapid Transit"),
        ("LRT", "Light Rail Transit"),
    )
    # Both agencies share the same URL, timezone and language values.
    return [
        Agency(code, title, "https://mrt.sg/", "Asia/Singapore", "EN")
        for code, title in agency_specs
    ]
def crawl_routes(self):
    """Build the route list from the static line definitions.

    Every ``SINGAPORE_MRT`` line produces two routes, suffixed ``_0`` and
    ``_1``. Every ``SINGAPORE_MRT_NO_LOOP`` and ``SINGAPORE_LRT`` line
    produces a single route.

    :return: list of ``Route`` objects, MRT first, then LRT.
    """
    route_list = []

    for line_code, line_name in SINGAPORE_MRT.items():
        for direction in (0, 1):
            route_list.append(Route('%s_%d' % (line_code, direction), 'MRT', line_code,
                                    '%s %d' % (line_name, direction), 1))

    route_list.extend(
        Route('%s' % line_code, 'MRT', line_code, line_name, 1)
        for line_code, line_name in SINGAPORE_MRT_NO_LOOP.items()
    )
    route_list.extend(
        Route(line_code, 'LRT', line_code, line_name, 1)
        for line_code, line_name in SINGAPORE_LRT.items()
    )
    return route_list
def crawl_calendars(self):
    """Build one calendar per (service prefix, day-type suffix) pair.

    Service prefixes are the MRT prefixes followed by the LRT prefixes from
    ``CALENDAR_PROPERTIES``. For each calendar the first five day flags are
    set when the suffix is ``'Weekday'``, the sixth when it is
    ``'Saturday'`` and the seventh when it is ``'Sunday'``.

    :return: list of ``Calendar`` objects, grouped by prefix.
    """
    props = CALENDAR_PROPERTIES
    service_prefixes = props['mrt_service_prefix'] + props['lrt_service_prefix']

    calendars = []
    for prefix in service_prefixes:
        for day_type in props['calendar_suffix']:
            weekday_flag = 1 if day_type == 'Weekday' else 0
            saturday_flag = 1 if day_type == 'Saturday' else 0
            sunday_flag = 1 if day_type == 'Sunday' else 0
            calendars.append(Calendar(
                prefix + '_' + day_type,
                weekday_flag, weekday_flag, weekday_flag, weekday_flag, weekday_flag,
                saturday_flag,
                sunday_flag,
                props['calendar_start_date'],
                props['calendar_end_date']))
    return calendars
def crawl_trips(self):
    """Crawl the MRT timetables and turn them, plus the LRT ones, into trips.

    For every ``route_id -> "<station> - <station>"`` entry of
    ``MRT_TRIP_HEADSIGN_DEFINES`` the two station names are resolved to stop
    codes using the list scraped by ``StopCodeCrawler``. The timetable for
    that pair is then fetched through ``TimeTableCrawler``.

    A headsign whose stations cannot both be resolved is skipped with a
    warning, so stale codes from the previous headsign are never reused.

    :return: the trips produced by ``trip_process`` for the crawled
        timetables followed by ``LRT_TIMETABLE_DEFINE``.
    """
    station_defines = StopCodeCrawler(
        'https://www.mytransport.sg/content/mytransport/home/commuting/trainservices.html?myRad=8#Train_Timetable').get_stop_code_from_time_table()

    time_tables = []
    # Iterating the items gives the route id directly, so no reverse
    # value -> key lookup is needed (that lookup picked the wrong key when two
    # headsigns were equal).
    for route_id, trip_headsign in MRT_TRIP_HEADSIGN_DEFINES.items():
        terminals = trip_headsign.split(" - ")
        arr_station_code = None
        dep_station_code = None
        for station in station_defines:
            if terminals[0] == station.stop_name:
                arr_station_code = station.stop_code
            if terminals[1] == station.stop_name:
                dep_station_code = station.stop_code

        if arr_station_code is None or dep_station_code is None:
            self.logger.warning("No stop code found for trip headsign %r (route %s); skipping",
                                trip_headsign, route_id)
            continue

        time_table_crawler = TimeTableCrawler(
            'https://www.mytransport.sg/content/mytransport/home/commuting/trainservices/jcr:content/par3/mrt_info.ajax.getTrainInformation.%s.%s.%s.%s.html' %
            (arr_station_code,
             dep_station_code,
             route_id[:2],
             trip_headsign.replace(' - ', '.')))
        # NOTE(review): each crawl result replaces the previous one, as in the
        # original code. This is only correct if ``crawl`` returns the
        # timetables accumulated so far - confirm against TimeTableCrawler.
        time_tables = time_table_crawler.crawl(route_id, trip_headsign)

    return self.trip_process(time_tables + LRT_TIMETABLE_DEFINE)
def crawl_calendar_dates(self, calendars):
    """Create one calendar-date entry per calendar and holiday.

    :param calendars: iterable of objects exposing ``service_id``.
    :return: list of ``CalendarDate(service_id, day, 1)`` for every day in
        ``SINGAPORE_HOLIDAYS``, grouped by calendar.
    """
    return [
        CalendarDate(calendar.service_id, holiday, 1)
        for calendar in calendars
        for holiday in SINGAPORE_HOLIDAYS
    ]
def crawl_stops(self, response):
    """Build the stop list by joining the station dropdown with the coordinates CSV.

    The ``mrtcode_start`` ``<select>`` of the page supplies, per option, a
    text of the form ``"<name> (<code>[/<code>...])"``. Every row of
    ``sg_mrt.csv`` is matched to the first option that lists its code
    (``stn_line + stn_no``). A station listed under several codes matches
    several rows, so the result is de-duplicated by stop name, keeping the
    first matching row.

    :param response: response of the rail e-guide index page.
    :return: list of ``Stop`` objects ordered by stop name; empty when
        nothing matched.
    """
    stations = response.css('select[name="mrtcode_start"]').extract()[0]
    soup = BeautifulSoup(stations, 'html')

    # Parse the dropdown once instead of re-parsing it for every CSV row.
    # The first <option> is skipped, as in the original lookup.
    station_options = []
    for option in soup.find_all('option')[1:]:
        label = option.text
        name_and_codes = label.rsplit(' ', 1)
        if len(name_and_codes) != 2:
            # No trailing "(CODE)" group to match against.
            continue
        stop_name, bracketed_codes = name_and_codes
        stop_id = bracketed_codes[1:-1]  # drop the surrounding brackets
        station_options.append((stop_id.split('/'), stop_id, stop_name, label))

    coord = pd.read_csv(SINGAPORE_MRT_INPUT + "/sg_mrt.csv", encoding='utf-8',
                        dtype={'stn_no': str, 'stn_line': str, 'lat': str, 'lon': str}, sep=';').fillna('')

    matched_stops = []
    for _, row_coord in coord.iterrows():
        code = row_coord['stn_line'] + row_coord['stn_no']
        for codes, stop_id, stop_name, label in station_options:
            if code in codes:
                matched_stops.append(Stop(stop_id=stop_id, stop_name=stop_name,
                                          stop_lat=row_coord['lat'], stop_lon=row_coord['lon'],
                                          stop_desc=label))
                break

    if not matched_stops:
        # groupby on a frame without a 'stop_name' column would raise KeyError.
        return []

    df = pd.DataFrame.from_records(
        [stop.__dict__ for stop in matched_stops]).groupby(['stop_name']).first().reset_index()
    return [
        Stop(stop_id=row['stop_id'], stop_name=row['stop_name'],
             stop_lat=row['stop_lat'], stop_lon=row['stop_lon'], stop_desc=row['stop_desc'])
        for _, row in df.iterrows()
    ]
def caculate_shape(self, response):
    """Build the shape points of every LRT and MRT line via ``get_route``.

    LRT: station coordinates come from ``sg_mrt.csv``. Rows are scanned from
    the top while their ``stn_line`` is PW, PE, SW or SE; scanning stops at
    the first other line or at the last row. For each line the legs are
    hub -> station ``1``, station -> next station, and last station -> hub.
    The hub is the row whose ``stn_line`` is ``PTC`` (for PW/PE) or ``STC``
    (for SW/SE).

    MRT: ``mrt.json`` supplies ``name``/``type``/``from``/``to`` entries;
    each entry is routed and its points are added to the shape called
    ``name``. Consecutive entries with the same name continue the same
    sequence numbering.

    :param response: unused.
    :return: list of ``Shape`` objects with 1-based ``shape_pt_sequence``.
    """
    lrt_hub_by_line = {"PW": "PTC", "PE": "PTC", "SW": "STC", "SE": "STC"}

    coord = pd.read_csv(SINGAPORE_MRT_INPUT + "/sg_mrt.csv", encoding='utf-8',
                        dtype={'stn_no': str, 'stn_line': str, 'lat': str, 'lon': str},
                        sep=';').fillna('')
    shapes = list()

    def lat_lon(row):
        # "lat,lon" string handed to get_route.
        return row['lat'] + ',' + row['lon']

    def hub_lat_lon(line):
        # Coordinates of the hub row of an LRT line; looked up only when needed.
        hub_rows = coord.loc[coord['stn_line'] == lrt_hub_by_line[line]]
        return hub_rows['lat'].values[0] + ',' + hub_rows['lon'].values[0]

    def append_leg(shape_id, origin, destination, last_sequence):
        # Route one leg, append its points and return the last sequence used.
        for point in get_route(origin, destination, mode='transit'):
            last_sequence += 1
            shapes.append(
                Shape(shape_id=shape_id,
                      shape_pt_lat=point[0],
                      shape_pt_lon=point[1],
                      shape_pt_sequence=last_sequence))
        return last_sequence

    # Calculate LRT
    k = 0
    last_position = len(coord) - 1
    # read_csv yields a default RangeIndex, so the iterrows label ``i`` is also
    # the positional index used with iloc below.
    for i, row_coord in coord.iterrows():
        line = row_coord['stn_line']
        if i == last_position or line not in lrt_hub_by_line:
            break
        here = lat_lon(row_coord)
        if line == coord.iloc[i + 1]['stn_line']:
            if row_coord['stn_no'] == '1':
                # Opening leg: hub to the first station of the line.
                k = append_leg(line, hub_lat_lon(line), here, k)
            k = append_leg(line, here, lat_lon(coord.iloc[i + 1]), k)
        else:
            # Last station of the line: close the leg back to the hub.
            k = append_leg(line, here, hub_lat_lon(line), k)
            # NOTE(review): numbering restarts once a line is closed - confirm
            # this per-line reset is the intended behaviour.
            k = 0

    # Calculate MRT
    with open(SINGAPORE_MRT_INPUT + "/mrt.json", encoding="utf8") as mrt_file:
        data = json.load(mrt_file)
    old_id = None
    for search in data:
        if old_id is None or old_id != search['name']:
            k = 0  # a new shape starts; its first point gets sequence 1
        suffix = " " + search['type'] + ", Singapore"
        k = append_leg(search['name'], search['from'] + suffix, search['to'] + suffix, k)
        old_id = search['name']
    return shapes
def get_stop_value(self, response):
    """Request the travel time between every pair of consecutive stops.

    Reads ``routes``, ``stops`` and ``trips`` from ``response.meta`` and maps
    each option text of the station dropdown to its form value. For each
    route it orders the route's stops and yields one ``FormRequest`` per
    consecutive stop pair, handled by ``create_stop_times``:

    * LRT routes follow the order given in ``LRT_LIST_STOPS``;
    * other routes take every stop whose id contains the route's line
      prefix, sorted by the number following that prefix.

    All requests share one ``stop_time_data`` dict. The first stop of each
    route is stored there up front, as a deep copy.
    """
    meta = response.meta
    routes = meta['routes']
    stops = meta['stops']
    trips = meta['trips']

    lrt_stop_total = sum(len(stop_ids) for stop_ids in LRT_LIST_STOPS.values())
    mrt_stop_total = sum(MRT_NUM_STOPS.values()) * 2
    scraped_data = {'trips': trips,
                    'num_route_stops': lrt_stop_total + mrt_stop_total,
                    'route_stop_dict': {}}
    stop_time_data = {'stop_times': scraped_data}

    option_nodes = response.xpath(
        '//*[@id="Content-eservice"]/article/section/article/section/form/dl[1]/dd/select/option[position()>1]')
    stop_value_dict = {}
    for node in option_nodes:
        option_text = node.xpath('.//text()').extract_first()
        stop_value_dict[option_text] = node.xpath('.//@value').extract_first()

    def position_on_line(stop_id, route_id):
        # Number that follows the line prefix in the matching part of a
        # (possibly multi-code, "/"-separated) stop id.
        line_prefix = route_id.split("_")[0]
        own_code = [part for part in stop_id.split("/") if line_prefix in part][0]
        return int(own_code.replace(line_prefix, ''))

    for route in routes:
        if route.route_id in LRT_LIST_STOPS:
            trips_stop = []
            for lrt_stop_id in LRT_LIST_STOPS[route.route_id]:
                trips_stop.append([stop for stop in stops if stop.stop_id == lrt_stop_id][0])
        else:
            line_prefix = route.route_id.split("_")[0]
            trips_stop = [stop for stop in stops if line_prefix in stop.stop_id]
            trips_stop.sort(key=lambda stop: position_on_line(stop.stop_id, route.route_id))

        ordered_stops = scraped_data['route_stop_dict'][route.route_id] = list()
        ordered_stops.append(copy.deepcopy(trips_stop[0]))

        # Walk the consecutive (from, to) stop pairs of the route.
        for from_stop, to_stop in zip(trips_stop, trips_stop[1:]):
            start_station_code = stop_value_dict[from_stop.stop_desc]
            end_station_code = stop_value_dict[to_stop.stop_desc]
            yield FormRequest("https://www.transitlink.com.sg/eservice/eguide/rail_info.php",
                              formdata={
                                  'mrtcode_start': start_station_code,
                                  'mrtcode_end': end_station_code
                              },
                              callback=self.create_stop_times,
                              dont_filter=True,
                              meta={'end_stop': to_stop, 'route_id': route.route_id,
                                    'stop_time_data': stop_time_data}
                              )
def create_stop_times(self, response):
    """Record the scraped travel time on the arrival stop of one stop pair.

    The value read from the result table is stored on the ``end_stop``
    carried in ``response.meta`` as ``time_calculate``. That stop is then
    appended to the stop list of its route inside the shared
    ``stop_time_data``.

    :return: the shared ``stop_time_data`` dict.
    """
    scraped_time = response.xpath(
        '//*[@id="Content-eservice"]/article/section/table[2]/tr[2]/td[4]/text()').extract_first()

    meta = response.meta
    stop_time_data = meta['stop_time_data']
    stops_of_route = stop_time_data['stop_times']['route_stop_dict'][meta['route_id']]

    arrival_stop = meta['end_stop']
    arrival_stop.time_calculate = scraped_time
    stops_of_route.append(arrival_stop)
    return stop_time_data
def crawl_stop_times(self, routes, stops, trips):
    """Return the request that starts the stop-time crawl.

    The rail e-guide index page is requested again with ``get_stop_value``
    as callback; routes, stops and trips travel along in the request meta.
    """
    carried = {'routes': routes, 'stops': stops, 'trips': trips}
    return Request("https://www.transitlink.com.sg/eservice/eguide/rail_idx.php",
                   callback=self.get_stop_value,
                   meta=carried)
def time_to_second(self, time):
    """Convert a clock string to seconds.

    :param time: string whose first two characters are the hours and whose
        remaining characters are the minutes (e.g. ``"0530"``).
    :return: seconds as a float. Hours of 2 or less are shifted by 24 hours
        (presumably so that after-midnight times sort after the evening).
    """
    hours = float(time[:2])
    minutes = float(time[2:])
    if not hours > 2:
        hours = hours + 24
    return hours * 3600 + minutes * 60
def second_to_time(self, sc):
    """Format a number of seconds as ``HH:MM:SS``.

    Hours are not wrapped at 24, so values past one day give e.g. ``26:00:00``.
    """
    hours, remainder = divmod(sc, 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return "%02i:%02i:%02i" % (hours, minutes, seconds)
def trip_start_time_generate(self, start_time, end_time, peak_period, peak, off_peak):
    """Generate the departure times of one service day, in seconds.

    Departures start at ``start_time`` and are spaced by a headway given in
    minutes. ``peak_period`` is a sequence of clock strings marking where
    the headway changes. Up to an even-indexed boundary the ``off_peak``
    headway applies; up to an odd-indexed one the ``peak`` headway applies.
    After the last boundary, ``off_peak`` is used until ``end_time``.

    A window that produces no departure (for instance a peak starting at
    ``start_time``) leaves the current time untouched. Previously this
    raised ``IndexError`` or rewound the time using the wrong headway.

    :param start_time: first departure, as accepted by ``time_to_second``.
    :param end_time: end of service (exclusive), same format.
    :param peak_period: boundary clock strings, or ``None``.
    :param peak: peak headway in minutes, or ``None`` to ignore ``peak_period``.
    :param off_peak: off-peak headway in minutes.
    :return: list of departure times in seconds.
    """
    start_time_list = []
    et = self.time_to_second(end_time)
    cr_time = self.time_to_second(start_time)

    def departures(first, until, headway_minutes):
        # Departures from ``first`` (inclusive) up to ``until`` (exclusive).
        step = headway_minutes * 60
        return [first + step * count for count in range(math.ceil((until - first) / step))]

    if peak_period is not None and peak is not None:
        for i, boundary in enumerate(peak_period):
            headway = peak if i & 1 else off_peak
            window = departures(cr_time, self.time_to_second(boundary), headway)
            if window:
                start_time_list += window
                # The next window starts one headway after this window's last departure.
                cr_time = window[-1] + headway * 60

    start_time_list += departures(cr_time, et, off_peak)
    return start_time_list
def trip_process(self, timeTables):
    """Expand timetables into individual trips.

    For every timetable the weekday, Saturday and Sunday departure times are
    generated with ``trip_start_time_generate`` (Sunday has no peak period).
    One ``Trip`` is created per departure, with service id
    ``<route_id>_<DayType>`` and trip id ``<route_id>_<n>_<DayType>``.

    :param timeTables: iterable of timetable objects exposing ``route_id``,
        ``trip_headsign`` and the ``wd_*`` / ``sat_*`` / ``sun_*`` fields.
    :return: list of ``Trip`` objects; per timetable the order is weekday,
        Saturday, Sunday.
    """
    trip_list = []

    def add_trips(tita, day_suffix, departures):
        # One trip per departure time of the given day type.
        line_code = tita.route_id[:2]
        for number, departure in enumerate(departures):
            trip_list.append(Trip(tita.route_id, tita.route_id + day_suffix,
                                  tita.route_id + '_' + str(number) + day_suffix,
                                  tita.trip_headsign, line_code, line_code,
                                  self.second_to_time(departure)))

    for tita in timeTables:
        add_trips(tita, '_Weekday',
                  self.trip_start_time_generate(tita.wd_start_time, tita.wd_end_time,
                                                tita.wd_peak_period, tita.wd_peak, tita.wd_off_peak))
        add_trips(tita, '_Saturday',
                  self.trip_start_time_generate(tita.sat_start_time, tita.sat_end_time,
                                                tita.sat_peak_period, tita.sat_peak, tita.sat_off_peak))
        add_trips(tita, '_Sunday',
                  self.trip_start_time_generate(tita.sun_start_time, tita.sun_end_time, None, None,
                                                tita.sun_off_peak))
    return trip_list
def close(self, reason):
    """Package the generated feed when the spider finishes.

    Zips the ``SINGAPORE_MRT_OUTPUT`` directory into an archive of the same
    base name, then deletes the ``*.txt`` files left in that directory.

    :param reason: close reason supplied by the caller; unused.
    """
    # ``os`` is not imported at the top of this file; import it here rather
    # than rely on it leaking in through one of the wildcard imports.
    import os

    self.logger.info("Finish processing GTFS Singapore...")
    shutil.make_archive(SINGAPORE_MRT_OUTPUT, 'zip', SINGAPORE_MRT_OUTPUT)
    for txt_path in glob.glob(SINGAPORE_MRT_OUTPUT + "/*.txt"):
        os.remove(txt_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement