# Untitled

# -*- coding: utf-8 -*-

import copy
import glob
import itertools
import math
import shutil

import pandas as pd
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import FormRequest
from scrapy.http import Request

from definitions import *
from util.google_maps import get_route
from .helper import *
from .models import *
from .properties import *
import json


class SingaporeTransitFeedSpider(scrapy.Spider):
    """Scrapy spider that emits the tables of a Singapore MRT/LRT GTFS feed.

    ``parse`` yields one item per table (agencies, routes, calendars, trips,
    stops, shapes, calendar dates) and finally schedules the requests that
    collect stop-to-stop travel times. Items are handed to the pipeline
    configured in ``custom_settings``.
    """
    # Name used to select this spider.
    name = 'singapore-transit-feed'
    allowed_domains = ['transitlink.com.sg']
    # Page whose response is passed to ``parse``.
    start_urls = ['https://www.transitlink.com.sg/eservice/eguide/rail_idx.php']
    # Per-spider settings: route every item through the Singapore GTFS pipeline.
    custom_settings = {
        'ITEM_PIPELINES': {
            'spider.pipelines.SingaporeGTFSPipeline': 300
        }
    }

    def parse(self, response):
        """Produce the feed one table at a time, then schedule the stop-time crawl.

        Each table is built right before it is yielded, in this order:
        agencies, routes, calendars, trips, stops, shapes, calendar dates.
        The last yielded value is the request returned by
        ``crawl_stop_times``, which needs the routes, stops and trips built
        earlier.

        Args:
            response: Response for the start URL; forwarded to the stop and
                shape builders.
        """
        self.logger.info("Start processing GTFS Singapore...")

        yield {'agencies': self.crawl_agencies()}

        route_list = self.crawl_routes()
        yield {'routes': route_list}

        calendar_list = self.crawl_calendars()
        yield {'calendars': calendar_list}

        trip_list = self.crawl_trips()
        yield {'trips': trip_list}

        stop_list = self.crawl_stops(response)
        yield {'stops': stop_list}

        yield {'shapes': self.caculate_shape(response)}

        yield {'calendar_dates': self.crawl_calendar_dates(calendar_list)}

        # Stop times need follow-up requests, so a Request is yielded instead of an item.
        yield self.crawl_stop_times(route_list, stop_list, trip_list)

    def crawl_agencies(self):
        """Return the two fixed agencies of the feed: MRT first, then LRT."""
        return [
            Agency("MRT", "Mass Rapid Transit", "https://mrt.sg/", "Asia/Singapore", "EN"),
            Agency("LRT", "Light Rail Transit", "https://mrt.sg/", "Asia/Singapore", "EN"),
        ]

    def crawl_routes(self):
        """Build the route list from the line-definition constants.

        Every ``SINGAPORE_MRT`` line yields two routes suffixed ``_0`` and
        ``_1``; ``SINGAPORE_MRT_NO_LOOP`` and ``SINGAPORE_LRT`` lines yield one
        route each. The resulting order is MRT, MRT-no-loop, then LRT.

        Returns:
            list of ``Route`` objects.
        """
        routes = []

        for line_code, line_name in SINGAPORE_MRT.items():
            # One route per suffix, 0 then 1.
            for direction in (0, 1):
                routes.append(Route('%s_%d' % (line_code, direction), 'MRT', line_code,
                                    '%s %d' % (line_name, direction), 1))

        routes.extend(Route('%s' % line_code, 'MRT', line_code, line_name, 1)
                      for line_code, line_name in SINGAPORE_MRT_NO_LOOP.items())

        routes.extend(Route(line_code, 'LRT', line_code, line_name, 1)
                      for line_code, line_name in SINGAPORE_LRT.items())

        return routes

    def crawl_calendars(self):
        """Create one calendar per (service prefix, day type) combination.

        Service prefixes are the MRT prefixes followed by the LRT prefixes from
        ``CALENDAR_PROPERTIES``; day types come from its ``calendar_suffix``
        entry. A 'Weekday' calendar sets the first five day flags, 'Saturday'
        the sixth and 'Sunday' the seventh.

        Returns:
            list of ``Calendar`` objects, ordered by prefix and then day type.
        """
        service_prefixes = (CALENDAR_PROPERTIES['mrt_service_prefix'] +
                            CALENDAR_PROPERTIES['lrt_service_prefix'])
        day_types = CALENDAR_PROPERTIES['calendar_suffix']
        start_date = CALENDAR_PROPERTIES['calendar_start_date']
        end_date = CALENDAR_PROPERTIES['calendar_end_date']

        calendar_list = []
        for service_prefix in service_prefixes:
            for day_type in day_types:
                weekday_flag = 1 if day_type == 'Weekday' else 0
                saturday_flag = 1 if day_type == 'Saturday' else 0
                sunday_flag = 1 if day_type == 'Sunday' else 0
                calendar_list.append(Calendar(
                    service_prefix + '_' + day_type,
                    weekday_flag, weekday_flag, weekday_flag, weekday_flag, weekday_flag,
                    saturday_flag,
                    sunday_flag,
                    start_date,
                    end_date))
        return calendar_list

    def crawl_trips(self):
        """Crawl the MRT timetables and expand them, with the LRT constants, into trips.

        For every ``route_id -> "<station> - <station>"`` entry of
        ``MRT_TRIP_HEADSIGN_DEFINES`` the two station names are resolved to
        station codes and the timetable page for that pair is crawled. A
        headsign whose stations cannot be resolved is skipped with a warning
        instead of reusing the codes of the previous headsign (or failing with
        an unbound local on the first one).

        Returns:
            The trip list produced by ``trip_process``.
        """
        station_defines = StopCodeCrawler(
            'https://www.mytransport.sg/content/mytransport/home/commuting/trainservices.html?myRad=8#Train_Timetable').get_stop_code_from_time_table()

        # Name -> code index built once; when a name occurs twice the later
        # entry wins, exactly like a full scan without an early exit.
        code_by_name = {station.stop_name: station.stop_code for station in station_defines}

        timeTables = None
        # Iterating items() gives the route id directly; a reverse lookup on the
        # values would return the wrong key when two routes share a headsign.
        for route_id, trip_headsign in MRT_TRIP_HEADSIGN_DEFINES.items():
            terminals = trip_headsign.split(" - ")
            if terminals[0] not in code_by_name or terminals[1] not in code_by_name:
                self.logger.warning("Skipping timetable for %s (%s): station not found in the station list",
                                    route_id, trip_headsign)
                continue
            arr_station_code = code_by_name[terminals[0]]
            dep_station_code = code_by_name[terminals[1]]

            timeTableCrawler = TimeTableCrawler(
                'https://www.mytransport.sg/content/mytransport/home/commuting/trainservices/jcr:content/par3/mrt_info.ajax.getTrainInformation.%s.%s.%s.%s.html' %
                (arr_station_code,
                 dep_station_code,
                 route_id[:2],
                 trip_headsign.replace(' - ', '.')))
            # NOTE(review): each crawl() result replaces the previous one, so only
            # the last one reaches trip_process. That is only correct if crawl()
            # returns an accumulated list - confirm against TimeTableCrawler.
            timeTables = timeTableCrawler.crawl(route_id, trip_headsign)

        if timeTables is None:
            # Nothing was crawled (empty mapping or every headsign skipped).
            return self.trip_process(LRT_TIMETABLE_DEFINE)
        return self.trip_process(timeTables + LRT_TIMETABLE_DEFINE)

    def crawl_calendar_dates(self, calendars):
        calendar_dates = list()
        for calendar in calendars:
            for day in SINGAPORE_HOLIDAYS:
                calendar_dates.append(CalendarDate(calendar.service_id, day, 1))
        return calendar_dates

    def crawl_stops(self, response):
        """Match the station drop-down of the page with the coordinates CSV.

        Option labels are split at the last space into a name and a bracketed
        code list, e.g. ``"<name> (<code>/<code>)"``. Each CSV row
        (``stn_line`` + ``stn_no``) is matched to the first option listing that
        code. The matches are then reduced to one stop per ``stop_name``.

        Args:
            response: Response containing the ``mrtcode_start`` select element.

        Returns:
            list of ``Stop`` objects ordered by stop name; empty when no CSV
            row matches any option.
        """
        stations = response.css('select[name="mrtcode_start"]').extract()[0]
        soup = BeautifulSoup(stations, 'html')

        # Parse each option once and index it by every code it lists. setdefault
        # keeps the first option per code, which is what a top-down scan finds.
        option_by_code = {}
        for option in soup.find_all('option')[1:]:  # first option is skipped
            label = option.text
            pieces = label.rsplit(' ', 1)
            if len(pieces) != 2:
                continue  # label without a trailing code group: nothing to match
            stop_id = pieces[1][1:-1]  # drop the surrounding brackets
            for station_code in stop_id.split('/'):
                option_by_code.setdefault(station_code, (stop_id, pieces[0], label))

        coord = pd.read_csv(SINGAPORE_MRT_INPUT + "/sg_mrt.csv", encoding='utf-8',
                            dtype={'stn_no': str, 'stn_line': str, 'lat': str, 'lon': str}, sep=';').fillna('')

        matched = []
        for _, row_coord in coord.iterrows():
            found = option_by_code.get(row_coord['stn_line'] + row_coord['stn_no'])
            if found is None:
                continue
            stop_id, stop_name, label = found
            matched.append(Stop(stop_id=stop_id, stop_name=stop_name,
                                stop_lat=row_coord['lat'], stop_lon=row_coord['lon'],
                                stop_desc=label))

        if not matched:
            # An empty frame has no 'stop_name' column and groupby would raise KeyError.
            return []

        # Keep the first record per stop name.
        df = pd.DataFrame.from_records([obj.__dict__ for obj in matched]).groupby(['stop_name']).first().reset_index()
        return [Stop(stop_id=row['stop_id'], stop_name=row['stop_name'],
                     stop_lat=row['stop_lat'], stop_lon=row['stop_lon'], stop_desc=row['stop_desc'])
                for _, row in df.iterrows()]

    def caculate_shape(self, response):
        """Build the shape points of every LRT and MRT line via ``get_route``.

        LRT: the leading rows of ``sg_mrt.csv`` whose ``stn_line`` is PW, PE,
        SW or SE are walked in file order. Consecutive stations of one line are
        joined; the station numbered '1' is first joined from the hub (PTC for
        PW/PE, STC for SW/SE) and the last station of a line is joined back to
        that hub. Point sequences restart at 1 for each line.

        MRT: every entry of ``mrt.json`` (``name``, ``type``, ``from``, ``to``)
        is routed; sequences keep counting while consecutive entries share the
        same ``name`` and restart at 1 otherwise.

        Args:
            response: Unused; kept for interface compatibility.

        Returns:
            list of ``Shape`` objects.
        """
        coord = pd.read_csv(SINGAPORE_MRT_INPUT + "/sg_mrt.csv", encoding='utf-8',
                            dtype={'stn_no': str, 'stn_line': str, 'lat': str, 'lon': str},
                            sep=';').fillna('')
        shapes = list()

        def lat_lon(row):
            # "lat,lon" string for a CSV row.
            return row['lat'] + ',' + row['lon']

        def hub_lat_lon(hub_code):
            # "lat,lon" of the first CSV row whose stn_line equals the hub code.
            hub_rows = coord.loc[coord['stn_line'] == hub_code]
            return hub_rows['lat'].values[0] + ',' + hub_rows['lon'].values[0]

        def add_leg(shape_id, origin, destination, last_sequence):
            # Append the routed points of one leg; return the last sequence number used.
            for point in get_route(origin, destination, mode='transit'):
                last_sequence += 1
                shapes.append(Shape(shape_id=shape_id,
                                    shape_pt_lat=point[0],
                                    shape_pt_lon=point[1],
                                    shape_pt_sequence=last_sequence))
            return last_sequence

        # Calculate LRT
        hub_of_line = {'PW': 'PTC', 'PE': 'PTC', 'SW': 'STC', 'SE': 'STC'}
        sequence = 0
        final_index = len(coord) - 1
        for i, row_coord in coord.iterrows():
            line = row_coord['stn_line']
            # Stop at the last row (it has no successor) or at the first non-LRT row.
            if i == final_index or line not in hub_of_line:
                break
            here = lat_lon(row_coord)
            following = coord.iloc[i + 1]
            if line == following['stn_line']:
                if row_coord['stn_no'] == '1':
                    # The first station of a line is entered from the hub.
                    sequence = add_leg(line, hub_lat_lon(hub_of_line[line]), here, sequence)
                sequence = add_leg(line, here, lat_lon(following), sequence)
            else:
                # Last station of the line: return to the hub and restart numbering.
                add_leg(line, here, hub_lat_lon(hub_of_line[line]), sequence)
                sequence = 0

        # Calculate MRT
        with open(SINGAPORE_MRT_INPUT + "/mrt.json", encoding="utf8") as mrt_file:
            data = json.load(mrt_file)
        previous_name = None
        for search in data:
            if previous_name is None or previous_name != search['name']:
                sequence = 0
            suffix = " " + search['type'] + ", Singapore"
            sequence = add_leg(search['name'], search['from'] + suffix, search['to'] + suffix, sequence)
            previous_name = search['name']

        return shapes

    def get_stop_value(self, response):
        """Yield one travel-time form request per pair of consecutive stops of every route.

        The response is the station selection page: its option texts are mapped
        to their form values. For every route the ordered stop list is
        determined (from ``LRT_LIST_STOPS`` when the route is listed there,
        otherwise all stops whose id contains the line code, sorted by station
        number). The first stop is stored right away in ``route_stop_dict``;
        every following stop is appended by ``create_stop_times`` when its
        request returns.

        Each request carries its own deep copy of the destination stop, as the
        first stop already did. ``create_stop_times`` writes ``time_calculate``
        on that object, and interchange stations belong to several routes, so
        sharing one object would let one route overwrite another's travel time.

        Args:
            response: Response whose ``meta`` holds ``routes``, ``stops`` and
                ``trips``.

        Yields:
            ``FormRequest`` objects handled by ``create_stop_times``.
        """
        routes = response.meta['routes']
        stops = response.meta['stops']
        trips = response.meta['trips']

        # Total number of route/stop entries: LRT stop lists plus twice the MRT counts.
        num_route_stops = (sum(len(stop_ids) for stop_ids in LRT_LIST_STOPS.values())
                           + sum(MRT_NUM_STOPS.values()) * 2)

        scraped_data = {'trips': trips, 'num_route_stops': num_route_stops, 'route_stop_dict': {}}
        stop_time_data = {'stop_times': scraped_data}

        # Option text -> form value of the station drop-down.
        stop_value_dict = {}
        all_stop_with_value = response.xpath(
            '//*[@id="Content-eservice"]/article/section/article/section/form/dl[1]/dd/select/option[position()>1]')
        for stop_value in all_stop_with_value:
            stop_value_dict[stop_value.xpath('.//text()').extract_first()] = stop_value.xpath(
                './/@value').extract_first()

        def pairwise(iterable):
            # s -> (s0, s1), (s1, s2), ...
            a, b = itertools.tee(iterable)
            next(b, None)
            return zip(a, b)

        def get_sort_key(stop_id, route_id):
            # Station number of the stop on this route's line, e.g. "AA1/BB24" on line "BB" -> 24.
            line_code = route_id.split("_")[0]
            primary_stop_id = [part for part in stop_id.split("/") if line_code in part][0]
            return int(primary_stop_id.replace(line_code, ''))

        for route in routes:
            if route.route_id in LRT_LIST_STOPS:
                # LRT routes: explicit stop order from the constant.
                trips_stop = [[stop for stop in stops if stop.stop_id == stop_id][0]
                              for stop_id in LRT_LIST_STOPS[route.route_id]]
            else:
                line_code = route.route_id.split("_")[0]
                trips_stop = sorted((stop for stop in stops if line_code in stop.stop_id),
                                    key=lambda stop: get_sort_key(stop.stop_id, route.route_id))

            scraped_data['route_stop_dict'][route.route_id] = [copy.deepcopy(trips_stop[0])]

            for origin, destination in pairwise(trips_stop):
                yield FormRequest("https://www.transitlink.com.sg/eservice/eguide/rail_info.php",
                                  formdata={
                                      'mrtcode_start': stop_value_dict[origin.stop_desc],
                                      'mrtcode_end': stop_value_dict[destination.stop_desc]
                                  },
                                  callback=self.create_stop_times,
                                  dont_filter=True,
                                  meta={'end_stop': copy.deepcopy(destination),
                                        'route_id': route.route_id,
                                        'stop_time_data': stop_time_data}
                                  )

    def create_stop_times(self, response):
        travel_time = response.xpath(
            '//*[@id="Content-eservice"]/article/section/table[2]/tr[2]/td[4]/text()').extract_first()

        stop_time_data = response.meta['stop_time_data']
        route_id = response.meta['route_id']
        route_stops = stop_time_data['stop_times']['route_stop_dict'][route_id]

        end_stop = response.meta['end_stop']
        end_stop.time_calculate = travel_time

        route_stops.append(end_stop)

        return stop_time_data

    def crawl_stop_times(self, routes, stops, trips):
        """Return the request that starts the stop-time crawl.

        The routes, stops and trips built so far travel in the request meta and
        are read back by ``get_stop_value``.
        """
        carried = {'routes': routes, 'stops': stops, 'trips': trips}
        return Request("https://www.transitlink.com.sg/eservice/eguide/rail_idx.php",
                       callback=self.get_stop_value,
                       meta=carried)


    def time_to_second(self, time):
        return float(time[:2]) * 3600 + float(time[2:]) * 60 if (float(time[:2]) > 2) else (float(
            time[:2]) + 24) * 3600 + float(time[2:]) * 60

    def second_to_time(self, sc):
        hours = sc // (60 * 60)
        sc %= (60 * 60)
        minutes = sc // 60
        sc %= 60
        return "%02i:%02i:%02i" % (hours, minutes, sc)

    def trip_start_time_generate(self, start_time, end_time, peak_period, peak, off_peak):
        start_time_list = []
        et = self.time_to_second(end_time)
        cr_time = self.time_to_second(start_time)
        if (peak_period is not None and peak is not None):
            for i in range(0, len(peak_period)):
                if i & 1:
                    start_time_list += [cr_time + peak * 60 * step for step in
                                        range(math.ceil((self.time_to_second(peak_period[i]) - cr_time) / (peak * 60)))]
                    cr_time = start_time_list[-1] + peak * 60
                else:
                    start_time_list += [cr_time + off_peak * 60 * step for step in range(
                        math.ceil((self.time_to_second(peak_period[i]) - cr_time) / (off_peak * 60)))]
                    cr_time = start_time_list[-1] + off_peak * 60
        start_time_list += [cr_time + off_peak * 60 * step for step in
                            range(math.ceil((et - cr_time) / (off_peak * 60)))]
        return start_time_list

    def trip_process(self, timeTables):
        trip_list = []
        for tita in timeTables:
            wd_start_time_list = self.trip_start_time_generate(tita.wd_start_time, tita.wd_end_time,
                                                               tita.wd_peak_period, tita.wd_peak, tita.wd_off_peak)
            for wd_index in range(len(wd_start_time_list)):
                trip_list.append(Trip(tita.route_id, tita.route_id + '_Weekday',
                                      tita.route_id + '_' + str(wd_index) + '_Weekday', tita.trip_headsign,
                                      tita.route_id[:2], tita.route_id[:2],
                                      self.second_to_time(wd_start_time_list[wd_index])))

            sat_start_time_list = self.trip_start_time_generate(tita.sat_start_time, tita.sat_end_time,
                                                                tita.sat_peak_period, tita.sat_peak, tita.sat_off_peak)
            for sat_index in range(len(sat_start_time_list)):
                trip_list.append(Trip(tita.route_id, tita.route_id + '_Saturday',
                                      tita.route_id + '_' + str(sat_index) + '_Saturday', tita.trip_headsign,
                                      tita.route_id[:2], tita.route_id[:2],
                                      self.second_to_time(sat_start_time_list[sat_index])))

            sun_start_time_list = self.trip_start_time_generate(tita.sun_start_time, tita.sun_end_time, None, None,
                                                                tita.sun_off_peak)
            for sun_index in range(len(sun_start_time_list)):
                trip_list.append(
                    Trip(tita.route_id, tita.route_id + '_Sunday', tita.route_id + '_' + str(sun_index) + '_Sunday',
                         tita.trip_headsign, tita.route_id[:2], tita.route_id[:2],
                         self.second_to_time(sun_start_time_list[sun_index])))
        return trip_list

    def close(self, reason):
        """Package the generated feed once the crawl has finished.

        Zips the ``SINGAPORE_MRT_OUTPUT`` directory into
        ``SINGAPORE_MRT_OUTPUT + '.zip'`` and then deletes the ``.txt`` files
        inside that directory.

        Args:
            reason: Close reason supplied by the caller; not used.
        """
        # Imported here because this module has no explicit `import os`; the
        # name would otherwise only exist if a wildcard import happened to leak it.
        import os

        self.logger.info("Finish processing GTFS Singapore...")
        shutil.make_archive(SINGAPORE_MRT_OUTPUT, 'zip', SINGAPORE_MRT_OUTPUT)
        for txt_path in glob.glob(SINGAPORE_MRT_OUTPUT + "/*.txt"):
            os.remove(txt_path)