Guest User

Untitled

a guest
Jul 23rd, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.41 KB | None | 0 0
  1. from selenium import webdriver
  2. from selenium.webdriver.common.by import By
  3. from selenium.webdriver.support.ui import WebDriverWait
  4. from urllib import request
  5. from bs4 import BeautifulSoup
  6. import requests
  7. from urllib.parse import urljoin
  8. import openpyxl as op
  9. import datetime
  10. import time
  11.  
  12.  
  13. def change_window(browser):
  14. all_handles = set(browser.window_handles)
  15. switch_to = all_handles - set([browser.current_window_handle])
  16. assert len(switch_to) == 1
  17. browser.switch_to.window(*switch_to)
  18.  
  19.  
  20. def main():
  21. for i in range(1,9):
  22. wb = op.load_workbook('一般名称.xlsx')
  23. ws = wb.active
  24. word = ws['A'+str(i)].value
  25.  
  26. driver = webdriver.Chrome(r'C:/chromedriver.exe')
  27. driver.get("https://www.pmda.go.jp/PmdaSearch/kikiSearch/")
  28. #id検索
  29. elem_search_word = driver.find_element_by_id("txtName")
  30. elem_search_word.send_keys(word)
  31. #name検索
  32. elem_search_btn = driver.find_element_by_name('btnA')
  33. elem_search_btn.click()
  34. change_window(driver)
  35.  
  36. #print(driver.page_source)
  37. cur_url = driver.current_url
  38. html = driver.page_source
  39. soup = BeautifulSoup(html,'html.parser')
  40. #print(cur_url)
  41.  
  42. has_pdf_link = False
  43. print(word)
  44.  
  45. wb = op.load_workbook('URL_DATA.xlsx')
  46. ws = wb.active
  47. ws['C'+str(i)].value = word
  48.  
  49. for a_tag in soup.find_all('a'):
  50. link_pdf = (urljoin(cur_url, a_tag.get('href')))
  51. #link_PDFから文末がpdfと文中にPDFが入っているものを抽出
  52. #print(word)
  53.  
  54. if (not link_pdf.lower().endswith('.pdf')) and ('/ResultDataSetPDF/' not in link_pdf):
  55. continue
  56. if ('searchhelp' not in link_pdf):
  57. has_pdf_link = True
  58. print(link_pdf)
  59. ws['B'+str(i)].value = link_pdf
  60.  
  61. if not has_pdf_link:
  62. print('False')
  63. ws['B'+str(i)].value = has_pdf_link
  64.  
  65. time.sleep(2)
  66. time_data = datetime.datetime.today()
  67.  
  68. ws['A'+str(i)].value = time_data
  69.  
  70. #wb = op.load_workbook('URL_DATA.xlsx')
  71. #ws = wb.active
  72. #時間を記入
  73. #ws['A'+str(i)].value = time_data
  74. #URLを記入
  75. #ws['B'+str(i)].value = link_pdf
  76. #一般名称を記入
  77. #ws['C'+str(i)].value = word
  78.  
  79. wb.save('URL_DATA.xlsx')
  80.  
  81.  
  82.  
  83. if __name__ == "__main__":
  84. main()
Add Comment
Please, Sign In to add comment