Advertisement
Niftl

Simple HTML parser using Cookies to handle the login

Apr 11th, 2019
212
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.39 KB | None | 0 0
  1. import os, http.cookiejar, urllib.request
  2. import re,csv
  3. from glob import glob
  4.  
  5. f = open("downloadProducts.sh",'w', encoding='utf-8')
  6.  
  7. for i in range(1,57):
  8.     url = "https://b2b.example.com/page/"+str(i)+"/?s=parameterHere" # this is from a wordpress site
  9.  
  10.     cj = http.cookiejar.MozillaCookieJar()
  11.     cj.load(os.path.join(os.path.expanduser("~"), ".netscape", "cookies.txt")) # path from local cookie file (downloaded from Chrome)
  12.     opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
  13.     r = opener.open(url)
  14.  
  15.     for lines in r.readlines():
  16.         if '<h2 class="entry-title">' in str(lines): # in case of encoding error (since site is using Greek language) use decode cp1252
  17.             flag = 1
  18.             match = re.search(r'href=[\'"]?([^\'" >]+)', str(lines))
  19.             if match:
  20.                 productURL = match.group(0).replace('href="',"")
  21.                 array = productURL.split('/')
  22.                 size = len(array)
  23.                 sku = array[size-2].split('-')[len(array[size-2].split('-'))-1] # format of product URL looks like this https://b2b.example.com/product/necklace-base-metal-sku/
  24.                 f.write('curl --cookie cookies.txt "' + productURL + '" -o ' + sku +'.html\n') # https://stackoverflow.com/questions/55608561/python3-issue-using-cookiejar-and-urllib-request since there is no responce from there, the other method is to use curl
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement