# NOTE: this script was copied from a Pastebin page; the site's banner text
# (advertisement / sign-up prompt) that preceded the code is not part of it.
###### Read Library #######
import csv
import html
import random
import time
import urllib.request
from datetime import datetime, timedelta

import lxml
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common import action_chains, keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import rakutenRMS_setup  # Importing the setup file
# Getting the set up values from rakutenRMS_setup.py file
location_Chrome = rakutenRMS_setup.location_Chrome  # handed to webdriver.Chrome()
url_default = rakutenRMS_setup.url_default  # login page URL
rLoginID = rakutenRMS_setup.rLoginID  # step-1 login ID
rLoginPassword = rakutenRMS_setup.rLoginPassword  # step-1 password
username = rakutenRMS_setup.username  # step-2 user ID
pass_word = rakutenRMS_setup.pass_word  # step-2 password

# Device type to track (used as the id of the radio button to click and as
# the "menu" query parameter when paging through the user list)
device_nav_code = rakutenRMS_setup.device_nav_code

# User action to track (visible text of the dropdown option to select)
users_dropdown_option = rakutenRMS_setup.users_dropdown_option

# Output file naming rules
#   dateBlock + "rakutenRMS_data.csv"                 -> user IDs
#   "Tracking_" + dateBlock + "rakutenRMS_data.csv"   -> tracking info
#   "User + Tracking data_" + dateBlock + ".csv"      -> final output after merging 2 CSVs
# Getting previous day's date for file names
yesterday = datetime.now() - timedelta(days=1)
dateBlock = yesterday.strftime('%Y%m%d')

# Start Session
session = requests.session()

# Login form fields posted with the requests session
login_info = {
    "loginID": rLoginID,
    "loginPassword": rLoginPassword,
    "user_name": username,
    "password": pass_word,
}
def create_session():
    """Return a new ``requests.Session`` with preset request headers.

    The session's default ``Accept``, ``Accept-Language`` and ``User-Agent``
    headers are overridden with the values below; everything else is left
    at the ``requests`` defaults.
    """
    s = requests.Session()
    s.headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0",
    })
    return s
# url_login = url_default + "/user/login_exec/"  # Example
url_login = url_default

# Post the login form with the requests session and fail fast on an HTTP
# error status.  The timeout keeps a stalled connection from blocking the
# script forever (requests waits indefinitely when no timeout is given).
res = session.post(url_login, data=login_info, timeout=30)
res.raise_for_status()

# Browser-like request headers.
# NOTE(review): nothing in the visible script passes this dict to a request —
# confirm whether it is still needed.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
    "Connection": "keep-alive",
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0",
}
# Start Chrome using the driver location configured in rakutenRMS_setup, and
# create the action chain used later to send raw key presses.
# NOTE(review): passing the driver path as the first positional argument is
# the older Selenium calling convention — confirm it matches the installed
# Selenium version.
driver = webdriver.Chrome(location_Chrome)
action = action_chains.ActionChains(driver)

# Load the login page first so the browser obtains its cookies.
driver.get(url_login)

# maximize the window
driver.maximize_window()

# Step 1 of the login : Login using rms user ID and rms password
driver.find_element(By.NAME, "login_id").send_keys(rLoginID)
time.sleep(1)
driver.find_element(By.NAME, "passwd").send_keys(rLoginPassword)
time.sleep(1)
# The submit button is located by its visible label text.
driver.find_element(By.XPATH, ".//*[contains(text(), '次へ')]").click()
time.sleep(2)
print("############################## Step 1: Successful ############################################")
# Optional: a screenshot here can be used to confirm the logged-in state.
# driver.save_screenshot(_dir + "/login.png")
# Step 2 of the login: Login with user ID (email) and password
driver.find_element(By.NAME, "user_id").send_keys(username)
time.sleep(1)
driver.find_element(By.NAME, "user_passwd").send_keys(pass_word)
time.sleep(1)

# https://stackoverflow.com/questions/27927964/selenium-element-not-visible-exception
# The login button is not clicked directly (see the two disabled attempts
# below); instead TAB key presses are sent and then RETURN.
# Original author's note: this also works with range(5) and with the F12 key
# instead of TAB -> confirm why.
# NOTE(review): the pasted source had lost its indentation, so whether the
# first perform() belongs inside the loop is a reconstruction — confirm.
# NOTE(review): ActionChains appears to keep already-performed actions queued
# unless reset_actions() is called, so the second perform() presumably
# replays the TABs before RETURN — verify against the Selenium version in use.
for _ in range(6):
    # action.send_keys(keys.Keys.F12)
    action.send_keys(keys.Keys.TAB)
action.perform()
action.send_keys(keys.Keys.RETURN)
action.perform()
# driver.find_element(By.XPATH, ".//*[contains(text(), 'ログイン')]").send_keys(Keys.RETURN)
# driver.find_element(By.XPATH, ".//*[contains(text(), 'ログイン')]").click()
time.sleep(2)
# Click the "つぎへ" (next) button
driver.find_element(By.XPATH, '//*[@class="rf-button-primary rf-block rf-medium"]').click()
time.sleep(2)

# Click the accept compliance policy button
driver.find_element(By.XPATH, '//*[@class="btn-reset btn-round btn-red"]').click()
time.sleep(1)

# Click on データ分析 and then on アクセス分析
driver.find_element(By.XPATH, '//*[@class="rms-nav-txt"]').click()
time.sleep(2)
print("Click on データ分析 successful")

# Instead of clicking アクセス分析, navigate directly to the page it opens.
driver.get("https://mainmenu.rms.rakuten.co.jp/?left_navi=32")
print("Click on アクセス分析 successful")
time.sleep(2)

# click on the 検索キーワード button
driver.find_element(By.ID, "mm_sub0303_05").click()
time.sleep(2)
print("click on the 検索キーワード button : Successful")

# Reaching the tracking page:
#   idea 1: try tab 16(?) times and hit enter OR hit enter, tab 15 times and
#           then hit enter again
#   idea 2: directly navigate to the page by using the hyperlink  <- used here
driver.get("https://rdatatool.rms.rakuten.co.jp/access/?menu=pc&evt=RT_P11_02&stat=1&owin=")
print("Click on トラキング link: Successful")

# Select the device type: device_nav_code is the id of the radio button.
driver.find_element(By.ID, device_nav_code).click()
print("Click on radio button: Successful")

# Select a value from the 対象の動作 dropdown by matching the option's text
# against users_dropdown_option (set in rakutenRMS_setup).
dropdown_target_action = driver.find_element(By.NAME, "limit")
for option in dropdown_target_action.find_elements(By.TAG_NAME, 'option'):
    if option.text == users_dropdown_option:
        option.click()  # select() in earlier versions of webdriver
        break
# Printed whether or not a matching option was found.
print("Selection of the value from the 対象の動作 dropdown: Successful")

# Submit the form through the "select_day" element.
# Original author's note: try submit; if not successful then use click.
driver.find_element(By.NAME, "select_day").submit()
print("Click on データ表示: Successful")
# ------------------------------------------------------------------------------
# Stage 1: walk the paginated user list.  For every table row that contains a
# user link, write one CSV row and remember the link to that user's tracking
# page in tracking_urls.
#
# Page layout notes from the original author:
#   Total number of tables in the page: 49
#   Tables with the pager links: 38, 41
#   Table with the user hyperlinks, access count, user action: 40
#
# NOTE(review): the pasted source had lost its indentation.  The nesting below
# was reconstructed from the logic — in particular that the hashed-ID append
# sits inside the length check and that the text append sits inside the
# `else` branch.  Confirm against the original script.
# ------------------------------------------------------------------------------
all_users_page_index = 1
tracking_page_index = 1  # not used in this stage; kept for the tracking stage
tracking_urls = []

with open(dateBlock + "rakutenRMS_data.csv", 'w', newline='', encoding='shift_jis') as csvFile:
    writer = csv.writer(csvFile)

    while True:
        all_users_page_index += 1
        obj_bs = BeautifulSoup(driver.page_source, "lxml")
        try:
            table = obj_bs.findAll("table")[40]
        except IndexError:
            # The page has no user table: stop paging.
            print("\n ------- Inside the first except block -------- \n")
            break

        for row in table.findAll("tr"):
            list_user_info = row.findAll(['td'])
            if not list_user_info:
                # Row without <td> cells: nothing to extract.
                continue

            flag_data_user = 0
            csvRow = []

            # Getting hashed user IDs: any cell whose text is longer than 32
            # characters contributes its first 32 characters.
            for cell in list_user_info:
                hashed_user_id = str(cell.get_text())
                if len(hashed_user_id) > 32:
                    csvRow.append(hashed_user_id[:32])

            for cell in list_user_info:
                # Getting URLs pointing to the user's tracking details: the
                # href is sliced out of the serialized <a> tag, which must
                # look like <a href="..." target="...">.
                user_tracking_link = str(cell.find("a"))
                locate_start = user_tracking_link.find('a href="')
                locate_end = user_tracking_link.find('" target="')
                if locate_start != -1 and locate_end != -1:
                    # 8 == len('a href="')
                    tracking_urls.append(user_tracking_link[locate_start + 8: locate_end])
                    flag_data_user = 1
                else:
                    # Plain cell: keep its text, prefixing "*" when the
                    # cell's <font> tag carries color #CC0000.
                    font_tag_info = str(cell.find("font"))
                    color_start = font_tag_info.find("color=")
                    # Skip 'color="' (7 chars) and take the 7-char color code.
                    color_code = font_tag_info[color_start + 7:color_start + 7 + 7] if color_start != -1 else ""
                    text_def = str(cell.get_text())
                    # Drop characters that cannot be written as shift_jis.
                    text_trim = text_def.encode("shift_jis", errors="ignore").decode("shift_jis", "ignore")
                    if color_code == "#CC0000":
                        text_trim = "*" + text_trim
                    csvRow.append(text_trim)

            # Only rows that contained a user link are written.
            if flag_data_user == 1:
                writer.writerow(csvRow)

        # Navigate to the next page of the user list, e.g.
        # https://rdatatool.rms.rakuten.co.jp/access/?menu=pc&evt=RT_P11_02&page=2&owin=  # page 2
        # https://rdatatool.rms.rakuten.co.jp/access/?menu=pc&evt=RT_P11_02&page=3&owin=  # page 3
        all_users_link_to_check = (
            "https://rdatatool.rms.rakuten.co.jp/access/?menu=" + device_nav_code
            + "&evt=RT_P11_02&page=" + str(all_users_page_index) + "&owin="
        )
        try:
            driver.get(all_users_link_to_check)
        except WebDriverException:
            print(" \n ----- Inside the second except block -------- \n ")
            break
        print("Now checking: " + all_users_link_to_check)
        time.sleep(random.randint(1, 3))

print("############### CSV File with user links created ############### ")
# input("############ Enter a key to scrape user tracking details ############ ")
# ------------------------------------------------------------------------------
# Stage 2: open every URL collected in tracking_urls and write the rows of the
# tracking-details table to a second CSV file, following the "次の30件>>"
# (next 30) link until it can no longer be clicked.
#
# NOTE(review): the pasted source had lost its indentation; the nesting below
# was reconstructed from the logic — confirm against the original script.
# ------------------------------------------------------------------------------
try:
    # Open new csv file for the main tracking behaviour
    with open("Tracking_" + dateBlock + "rakutenRMS_data.csv", 'w', newline='', encoding='shift_jis') as csvFile_tracking:
        writer2 = csv.writer(csvFile_tracking)

        for index, raw_url in enumerate(tracking_urls):
            # The URLs were sliced out of serialized HTML, so "&" is still
            # written as the entity "&amp;"; decode it before navigating.
            url_to_click = html.unescape(raw_url)
            driver.get(url_to_click)
            print("[User:" + str(index + 1) + "] Now opening this url (Tracking): " + url_to_click)
            driver.switch_to.window(driver.window_handles[-1])

            while True:
                # Re-parse on every pass so each page's own rows are written
                # (parsing once before the loop would repeat the first page).
                obj_bs2 = BeautifulSoup(driver.page_source, "lxml")
                try:
                    # Table at index 17 holds the main tracking details; the
                    # inner table is contained in its first <tr>.
                    table = obj_bs2.findAll("table")[17]
                    outer_tr = table.findAll("tr")[0]
                except IndexError:
                    print("\n\n\n Inside the first except block : Tracking\n")
                    print("Table with user info not found")
                    print("-------------------------------------------------")
                    break

                for row in outer_tr.findAll("tr"):
                    list_user_tracking_info = row.findAll(['td'])
                    if not list_user_tracking_info:
                        print("-----------------> I am here <---------------------")  # Confirm if code flows here
                        # Skip rows without <td> cells instead of writing an
                        # empty CSV row for them.
                        continue
                    # Drop characters that cannot be written as shift_jis.
                    csvRow2 = [
                        str(cell.get_text()).encode("shift_jis", errors="ignore").decode("shift_jis", "ignore")
                        for cell in list_user_tracking_info
                    ]
                    writer2.writerow(csvRow2)

                try:
                    # Clicking the "次の30件>>" link to navigate
                    driver.find_element(By.ID, "page_next1").click()
                except WebDriverException:
                    # No usable "next" link: this user's pages are done.
                    print("\n ----- Inside the second except block : Tracking -------- \n")
                    time.sleep(random.randint(1, 3))
                    break
                # Two random 1-3 second pauses give the next page time to load
                # before it is parsed.
                time.sleep(random.randint(1, 3))
                time.sleep(random.randint(1, 3))

    print("\n\n================== All rows printed and saved to the csv ======================\n\n")
finally:
    # Always shut the browser down, even when scraping fails part-way.
    driver.quit()
# End of script. (Pastebin page footer text — advertisement / comment prompt —
# followed the code on the source page and is not part of it.)