Advertisement
Guest User

Untitled

a guest
Nov 5th, 2018
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 14.47 KB | None | 0 0
  1.  
  2.  
  3.  
  4. ###### Read Library #######
  5.  
  6. import rakutenRMS_setup # Importing the setup file
  7.  
  8. import urllib.request
  9. import time
  10. import requests
  11. import lxml
  12. import csv
  13. from datetime import datetime, timedelta
  14.  
  15. import time
  16. import random
  17.  
  18. from bs4 import BeautifulSoup
  19.  
  20. from selenium import webdriver
  21. from selenium.webdriver.common.keys import Keys
  22. from selenium.webdriver.common import action_chains, keys
  23. from selenium.webdriver.common.by import By
  24. from selenium.webdriver.support.ui import WebDriverWait
  25. from selenium.webdriver.support import expected_conditions as EC
  26.  
  27. # Getting the set up values from rakutenRMS_setup.py file
  28. location_Chrome = rakutenRMS_setup.location_Chrome
  29.  
  30.  
  31. url_default = rakutenRMS_setup.url_default
  32. rLoginID = rakutenRMS_setup.rLoginID
  33. rLoginPassword = rakutenRMS_setup.rLoginPassword
  34. username = rakutenRMS_setup.username
  35. pass_word = rakutenRMS_setup.pass_word
  36.  
  37. # Device type to track
  38. device_nav_code = rakutenRMS_setup.device_nav_code
  39.  
  40. # User action to track
  41. users_dropdown_option = rakutenRMS_setup.users_dropdown_option
  42.  
  43. # Output file naming rules
  44. # dateBlock + "rakutenRMS_data.csv" # user IDs
  45. # "Tracking_" + dateBlock + "rakutenRMS_data.csv" # tracking info
  46. # "User + Tracking data_" + dateBlock + ".csv" # Final output after merging 2 CSVs
  47.  
  48. yesterday = datetime.now() - timedelta(days=1) #Getting previous day's date for file names
  49. dateBlock = yesterday.strftime('%Y%m%d')
  50.  
  51.  
  52. # Start Session
  53. session = requests.session()
  54.  
  55. # Login
  56. login_info = {
  57. "loginID": rLoginID,
  58. "loginPassword": rLoginPassword,
  59. "user_name" : username,
  60. "password" : pass_word
  61. }
  62.  
  63.  
  64. def create_session():
  65. s = requests.Session()
  66. s.headers.update({
  67. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  68. "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
  69. "User-Agent":
  70. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0"
  71. })
  72. return s
  73.  
  74.  
  75. # url_login = url_default + "/user/login_exec/" # Example
  76. url_login = url_default
  77. res = session.post(url_login, data=login_info)
  78. res.raise_for_status()
  79.  
  80. headers = {
  81. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  82. "Accept-Language": "ja,en-US;q=0.7,en;q=0.3",
  83. "Connection": "keep-alive",
  84. "User-Agent":
  85. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0"
  86. }
  87.  
  88. driver = webdriver.Chrome(location_Chrome)
  89. action = action_chains.ActionChains(driver)
  90.  
  91.  
  92. # クッキーを先に取得する
  93. driver.get(url_login)
  94.  
  95. # maximize the window
  96. driver.maximize_window()
  97.  
  98. # Step 1 of the login : Login using rms user ID and rms password
  99. driver.find_element_by_name("login_id").send_keys(rLoginID)
  100. time.sleep(1)
  101.  
  102. driver.find_element_by_name("passwd").send_keys(rLoginPassword)
  103. time.sleep(1)
  104.  
  105.  
  106. driver.find_element_by_xpath(".//*[contains(text(), '次へ')]").click()
  107. time.sleep(2)
  108.  
  109. print("############################## Step 1: Successful ############################################")
  110.  
  111.  
  112. # driver.save_screenshot(_dir + "/login.png") # ログイン済みを画像で確認できます
  113.  
  114. # Step 2 of the login: Login with user ID (email) and password
  115.  
  116. driver.find_element_by_name("user_id").send_keys(username)
  117. time.sleep(1)
  118. driver.find_element_by_name("user_passwd").send_keys(pass_word)
  119. time.sleep(1)
  120.  
  121.  
  122. # https://stackoverflow.com/questions/27927964/selenium-element-not-visible-exception
  123.  
  124.  
  125. """
  126. Hitting tab 5 times to reach the desired element and then hitting enter
  127. This code block also works with range(5) and F12 key -> Confirm why"""
  128.  
  129. for i in range(6):
  130. # action.send_keys(keys.Keys.F12)
  131. action.send_keys(keys.Keys.TAB)
  132. action.perform()
  133.  
  134. action.send_keys(keys.Keys.RETURN)
  135. action.perform()
  136.  
  137.  
  138. #driver.find_element_by_xpath(".//*[contains(text(), 'ログイン')]").send_keys(Keys.RETURN)
  139. #driver.find_element_by_xpath(".//*[contains(text(), 'ログイン')]").click()
  140.  
  141. time.sleep(2)
  142.  
  143.  
  144. # Click the "つぎへ" button
  145. driver.find_element_by_xpath('//*[@class="rf-button-primary rf-block rf-medium"]').click()
  146. time.sleep(2)
  147.  
  148.  
  149. # Click the accept compliance policy button
  150. driver.find_element_by_xpath('//*[@class="btn-reset btn-round btn-red"]').click()
  151. time.sleep(1)
  152.  
  153.  
  154. # Click on データ分析 and then on アクセス分析
  155. driver.find_element_by_xpath('//*[@class="rms-nav-txt"]').click()
  156. time.sleep(2)
  157.  
  158. print("Click on データ分析 successful")
  159.  
  160.  
  161. # directly navigate to the page after clicking the アクセス分析
  162. driver.get("https://mainmenu.rms.rakuten.co.jp/?left_navi=32")
  163.  
  164.  
  165. print("Click on アクセス分析 successful")
  166. time.sleep(2)
  167.  
  168.  
  169. # click on the 検索キーワード button
  170. driver.find_element_by_id("mm_sub0303_05").click()
  171. time.sleep(2)
  172.  
  173. print("click on the 検索キーワード button : Successful")
  174.  
  175. # idea 1: try tab 16(?) times and hit enter OR hit enter, tab 15 times and then hit enter again
  176. # idea 2: directly navigate to the page by using the hyperlink
  177. # https://rdatatool.rms.rakuten.co.jp/access/?menu=pc&evt=RT_P11_02&stat=1&owin=
  178.  
  179. driver.get("https://rdatatool.rms.rakuten.co.jp/access/?menu=pc&evt=RT_P11_02&stat=1&owin=")
  180. print("Click on トラキング link: Successful")
  181.  
  182.  
  183. driver.find_element_by_id(device_nav_code).click()
  184. print("Click on radio button: Successful")
  185.  
  186.  
  187. # Select a value from the 対象の動作 dropdown
  188. dropdown_target_action = driver.find_element_by_name("limit")
  189. for option in dropdown_target_action.find_elements_by_tag_name('option'):
  190. if option.text == users_dropdown_option: # users_dropdown_option is set at the top of the page
  191. option.click() # select() in earlier versions of webdriver
  192. break
  193. print("Selection of the value from the 対象の動作 dropdown: Successful")
  194.  
  195.  
  196. # Try submit. If not successful then use click
  197. driver.find_element_by_name("select_day").submit()
  198. print("Click on データ表示: Successful")
  199.  
  200.  
  201. csvFile = open(dateBlock + "rakutenRMS_data.csv", 'w', newline='', encoding='shift_jis')
  202. writer = csv.writer(csvFile)
  203.  
  204. flag_continue = 1
  205.  
  206. all_users_page_index = 1
  207. # user_infopage_index = 1
  208. tracking_page_index = 1
  209.  
  210. user = ""
  211. access_count = ""
  212. user_action = ""
  213.  
  214.  
  215. tracking_urls = []
  216.  
  217. # ------------------------------------------------------------------------------
  218. # Total number of tables in the page: 49
  219. # Table with the pager links: 38,41
  220. # Table with the user hyperlinks, access count, user action: 40
  221. # print("no of tables in the page:", len(table))
  222. # -------------------------------------------------------------------------------
  223.  
  224. # for t in range(len(table)):
  225. # print("================================== table number: " + str(t) + " ====================================")
  226. # print(table[t])
  227.  
  228.  
  229. while flag_continue == 1:
  230. all_users_page_index += 1
  231. obj_bs = BeautifulSoup(driver.page_source, "lxml")
  232.  
  233. try:
  234. table = obj_bs.findAll("table")[40]
  235. except Exception:
  236. print("\n ------- Inside the first except block -------- \n")
  237. flag_continue = 0
  238. break
  239.  
  240. rows = table.findAll("tr")
  241.  
  242. for row in rows:
  243. flag_data_user = 0
  244. list_user_info = row.findAll(['td'])
  245. csvRow = []
  246.  
  247. if len(list_user_info) == 0:
  248. # confirm: put a print statement here to check if code flows here or not
  249. next
  250.  
  251. # Getting hashed user IDs
  252. for cell_index in list_user_info:
  253. hashed_user_id = str(cell_index.get_text())
  254. if len(hashed_user_id) > 32:
  255. hashed_user_id = hashed_user_id[:32]
  256. csvRow.append(hashed_user_id)
  257.  
  258. # Getting URLs pointing to the user's tracking details
  259. for cell in list_user_info:
  260. user_tracking_link = str(cell.find("a"))
  261. locate_start = user_tracking_link.find('a href="')
  262. locate_end = user_tracking_link.find('" target="')
  263.  
  264. # if locate_start == -1 or locate_end == -1:
  265. #
  266. # print("User link not found")
  267. #
  268. # else:
  269. # text_trim = user_tracking_link[locate_start + 8: locate_end]
  270. # print("text_to_add is now:", text_trim)
  271. # csvRow.append(text_trim)
  272. # flag_data_user = 1
  273.  
  274. # alternate form of the above code block # De Morgan's Law
  275. if locate_start != -1 and locate_end != -1:
  276. text_trim = user_tracking_link[locate_start + 8: locate_end]
  277. tracking_urls.append(text_trim)
  278. # print("text_to_add is now:", text_trim)
  279.  
  280. flag_data_user = 1
  281. else:
  282. font_tag_info = str(cell.find("font"))
  283. color_start = font_tag_info.find("color=")
  284.  
  285. #TODO: Check and delete the else block
  286. # if color_start != -1:
  287. # color_code = font_tag_info[color_start + 7:color_start + 7 + 7]
  288. # else:
  289. # color_code = "" # Color Info not found, set color_code to an empty string
  290.  
  291. color_code = font_tag_info[color_start + 7:color_start + 7 + 7] if color_start != -1 else ""
  292.  
  293. text_def = str(cell.get_text())
  294. text_trim = text_def.encode("shift_jis", errors="ignore").decode("shift_jis", "ignore")
  295. text_trim = "*" + text_trim if color_code == "#CC0000" else text_trim
  296.  
  297.  
  298. csvRow.append(text_trim)
  299.  
  300. if flag_data_user == 1:
  301. writer.writerow(csvRow)
  302.  
  303. try:
  304. # code to navigate to the next page here
  305.  
  306. # example urls
  307. # https://rdatatool.rms.rakuten.co.jp/access/?menu=pc&evt=RT_P11_02&page=2&owin= # page 2
  308. # https://rdatatool.rms.rakuten.co.jp/access/?menu=pc&evt=RT_P11_02&page=3&owin= # page 3
  309. all_users_link_to_check = "https://rdatatool.rms.rakuten.co.jp/access/?menu=" + device_nav_code + "&evt=RT_P11_02&page=" + str(all_users_page_index) + "&owin="
  310. driver.get(all_users_link_to_check)
  311. print("Now checking: " + all_users_link_to_check)
  312. time.sleep(random.randint(1, 3))
  313.  
  314. except Exception:
  315. flag_continue = 0
  316. print(" \n ----- Inside the second except block -------- \n ")
  317. break
  318.  
  319. print("############### CSV File with user links created ############### ")
  320. # input("############ Enter a key to scrape user tracking details ############ ")
  321. csvFile.close()
  322.  
  323.  
  324. # Open new csv file for the main tracking behaviour
  325. csvFile_tracking = open("Tracking_" + dateBlock + "rakutenRMS_data.csv", 'w', newline='', encoding='shift_jis')
  326. writer2 = csv.writer(csvFile_tracking)
  327.  
  328. main_window = driver.current_window_handle
  329.  
  330.  
  331. for index in range(len(tracking_urls)):
  332. # for index in range(3): # Checking only the first 3 users for testing
  333. flag_continue_tracking = 1
  334. tracking_page_index += 1
  335.  
  336. url_to_click = tracking_urls[index].replace("&", "&")
  337. driver.get(url_to_click)
  338. print("[User:" + str(index + 1) + "] Now opening this url (Tracking): " + url_to_click)
  339.  
  340. driver.switch_to.window(driver.window_handles[-1])
  341.  
  342. obj_bs2 = BeautifulSoup(driver.page_source, "lxml")
  343.  
  344. while flag_continue_tracking == 1:
  345.  
  346. try:
  347. table = obj_bs2.findAll("table")[17] # Looking into the 17th table for the main tracking details
  348.  
  349. except Exception:
  350. print("\n\n\n Inside the first except block : Tracking\n")
  351. print("Table with user info not found")
  352. print("-------------------------------------------------")
  353. flag_continue_tracking = 0
  354. break
  355.  
  356. outer_tr = table.findAll("tr")[0] # The inner table is contained in the first <tr>
  357. inner_trs = outer_tr.findAll("tr")
  358.  
  359.  
  360. for row in inner_trs:
  361. flag_tracking_page = 0 # not used?
  362. list_user_tracking_info = row.findAll(['td'])
  363. csvRow2 = []
  364.  
  365. if len(list_user_tracking_info) == 0:
  366. print("-----------------> I am here <---------------------") # Confirm if code flows here
  367. next
  368.  
  369. for cell_tracking_index in range(len(list_user_tracking_info)):
  370.  
  371. text_def = str(list_user_tracking_info[cell_tracking_index].get_text())
  372. text_trim = text_def.encode("shift_jis", errors="ignore").decode("shift_jis", "ignore")
  373. csvRow2.append(text_trim)
  374.  
  375.  
  376. writer2.writerow(csvRow2)
  377.  
  378. try:
  379. driver.find_element_by_id("page_next1").click() # Clicking the "次の30件>>" link to navigate
  380.  
  381.  
  382. time.sleep(random.randint(1, 3))
  383. action.send_keys(keys.Keys.F5)
  384. # driver.implicitly_wait(15)
  385. # ------ Trying the selenium webdriver wait stmt: Start
  386. # try:
  387. # element = WebDriverWait(driver, 10).until(
  388. # EC.presence_of_element_located((By.ID, "page_next1"))
  389. # )
  390. # except Exception:
  391. # print("==== There was an error ====")
  392. # input("Press any key to continue:")
  393. # break
  394.  
  395. # ------ Trying the selenium webdriver wait stmt: End
  396.  
  397.  
  398. # action.key_down(Keys.CONTROL).click(driver.find_element_by_id("page_next1")).key_up(Keys.CONTROL).perform()
  399. # driver.switch_to.window(driver.window_handles[-1])
  400.  
  401. #tracking_page_index += 1
  402.  
  403. #tracking_link_to_check = "https://rdatatool.rms.rakuten.co.jp/access/?menu=" + device_nav_code + "&evt=RT_P11_01&page=" + str(tracking_page_index) + "&owin="
  404. #driver.get(tracking_link_to_check)
  405. # driver.execute_script("window.open('');")
  406. # time.sleep(3)
  407. # driver.switch_to.window(driver.window_handles[-2])
  408.  
  409. # input("Check active window:") # Check the execution of this line
  410. # print("Now checking: " + tracking_link_to_check)
  411. time.sleep(random.randint(1, 3))
  412.  
  413. except Exception:
  414. flag_continue_tracking = 0
  415. print("\n ----- Inside the second except block : Tracking -------- \n")
  416. time.sleep(random.randint(1, 3))
  417. break
  418.  
  419.  
  420. print("\n\n================== All rows printed and saved to the csv ======================\n\n")
  421.  
  422. csvFile_tracking.close()
  423. driver.close()
  424. driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement