Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python3
- #coding: utf8
- import requests
- from pprint import pprint
- from random import randint, uniform
- from time import sleep
- from pyquery import PyQuery as pq
- import csv
- import itertools
- import re, demjson
- import decimal
- from random import uniform
class drange():
    """Decimal counterpart of the built-in ``range()``.

    Represents the half-open arithmetic progression ``start, start + step,
    ...`` that stops before reaching ``stop``.  A negative ``step`` counts
    downwards.  All three arguments are converted with ``decimal.Decimal``,
    so a float is converted to its exact binary value; pass a string
    (e.g. ``'0.1'``) for fractions that are not exactly representable.

    The object is its own iterator: ``iter()`` rewinds it to ``start`` and
    returns the same object, so two simultaneous iterations over one
    instance share their position.
    """

    def __init__(self, start, stop, step = 1):
        self.start = decimal.Decimal(start)
        self.stop = decimal.Decimal(stop)
        self.step = decimal.Decimal(step)
        self.value = self.start

    def __iter__(self):
        # Rewind so the same object can be looped over more than once.
        self.value = self.start
        return self

    def __next__(self):
        ascending = self.step > 0 and self.value < self.stop
        descending = self.step < 0 and self.value > self.stop
        if not (ascending or descending):
            # Also reached immediately when step == 0.
            raise StopIteration()
        current = self.value
        self.value += self.step
        return current

    def __len__(self):
        """Return the number of values a full iteration yields."""
        if self.step == 0:
            # Iteration yields nothing for a zero step; avoid dividing by it.
            return 0
        # stop is exclusive, so the count is ceil((stop - start) / step),
        # exactly as for range().
        steps = (self.stop - self.start) / self.step
        count = steps.to_integral_value(rounding=decimal.ROUND_CEILING)
        return max(0, int(count))
# Settings
#
# Wider ranges kept for reference (the active ones below cover only a
# couple of values):
# ranges = {
#     'man': {
#         'weights': range(126, 226 + 1, 1),
#         'heights': drange(45, 150 + 0.5, 0.5),
#     },
#
#     'woman': {
#         'weights': range(126, 200 + 1, 1),
#         'heights': drange(30, 140 + 0.5, 0.5),
#     },
# }
#
# NOTE(review): the values are sent as 'weight' in kg and 'height' in cm.
# 126..226 looks like a height in cm and 45..150 like a weight in kg, so the
# two keys may be swapped -- confirm against the site before a full run.
ranges = {
    'man': {
        'weights': range(126, 127 + 1, 1),
        'heights': drange(45, 45 + 0.5, 0.5),
    },
    'woman': {
        'weights': range(126, 126 + 1, 1),
        'heights': drange(30, 30 + 0.5, 0.5),
    },
}

# Per-gender shop: the site to query and the product ids put into the cart.
# We need to add various kinds of products to the cart in order to get all
# available measurement options in the calculator.
_SITES = {
    'man': {
        'base_url': 'https://www.tailor4less.com/en-uk',
        'products': (
            4839,
            4854,
            4746,
            646,
            616,
            378,
            5856,
            6239,
            5584,
            2092,
        ),
    },
    'woman': {
        'base_url': 'https://www.sumissura.com/en-uk',
        'products': (
            7053,
            6809,
            7315,
            7055,
            6420,
            6969,
            7652,
            7660,
            7314,
            7129,
            6713,
            7559,
            7045,
            7045,
            6978,
            438,
            7563,
            442,
            7562,
        ),
    },
}


def _fetch_estimate(session, url, params):
    """Request *url* with *params* and return the decoded JSON reply.

    Retries forever: any failure of the request or of JSON decoding is
    printed, followed by a random 5-30 second pause and another attempt.
    """
    while True:
        try:
            return session.get(url, params=params, timeout=20).json()
        except Exception as msg:
            # Deliberately broad: this is a best-effort retry around the
            # network call only, so a long run survives transient failures.
            print('ERROR:', msg)
            sleep(uniform(5, 30))
            print('Retry')


def _estimate_range_text(raw_value, measure_type, options):
    """Return the 'Usually between ...' text for one measurement.

    *raw_value* is the estimate returned by the site and *options* the
    decoded ``var options`` object of the measures page.  The bounds are the
    estimate adjusted by the two percentages configured for *measure_type*
    (a 'required' entry wins over an 'optional' one).  Returns '' when the
    value is not a positive number, when no range is configured, or when
    anything goes wrong (the error is printed).
    """
    try:
        measure = float(raw_value)
        if not measure > 0:
            return ''
        optionals = options['measures']['optional']
        requireds = options['measures']['required']
        rng = None
        if optionals and measure_type in optionals:
            rng = optionals[measure_type]
        if requireds and measure_type in requireds:
            rng = requireds[measure_type]
        if not rng:
            return ''
        if ('range' in rng) and len(rng.get('range')) == 2:
            bounds = rng['range']
        else:
            bounds = rng['range']['tolerance']['normal']['max']
        low = round(measure * (bounds[0] / 100 + 1))
        high = round(measure * (bounds[1] / 100 + 1))
        return 'Usually between %dcm and %dcm' % (low, high)
    except Exception as msg:
        print('ERROR:', msg)
        return ''


for gender in ('woman', 'man'):
    total = 0
    print('==============================')
    print('*** Start scraping for %s ***' % gender)
    site = _SITES[gender]
    base_url = site['base_url']
    # newline='' lets the csv module control line endings itself.
    with open('measurements_%s.csv' % gender, 'w', newline='') as f:
        # Prepare object for queries
        requests.packages.urllib3.disable_warnings()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US;q=0.6,en;q=0.4",
            "Referer": base_url,
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
        }
        session = requests.session()
        session.headers = headers
        writer = None

        print('Adding products to cart...')
        for product in site['products']:
            url = '%s/feed/buy/%d' % (base_url, product)
            session.get(url)
        print('Done!')

        # Receiving all available constitutions
        print('Loading constitutions variants...')
        constitutions = {}
        url = '%s/checkout/measures/?step=start' % (base_url)
        r = session.get(url)
        dom = pq(r.text)
        for elem in dom('.constitutions .constitution input[type="radio"]'):
            elem = pq(elem)
            name, value = elem.attr('name'), elem.attr('value')
            constitutions.setdefault(name, []).append(value)
        print('Constitutions:')
        pprint(constitutions)
        print('Done!')

        # Values of the form inputs on the page; sent along with every
        # estimate request.
        default_params = {}
        for elem in dom('.process .inputs .input > input'):
            elem = pq(elem)
            name, value = elem.attr('name'), elem.attr('value')
            default_params[name] = value

        # The page embeds its measurement settings as a JavaScript object
        # literal; pull it out and decode it.
        match = re.search(r"var options = (\{[^=]*\});\s*<\/script>", r.text, re.DOTALL)
        if match is None:
            raise RuntimeError('"var options = {...}" block not found at %s' % url)
        options = demjson.decode(match.group(1))

        # Constitutions does not affect measurements for woman (seems to be
        # a site bug), so a single empty combination is used there.
        if gender == 'woman':
            constitutions_combs = [{}]
        else:
            keys = sorted(constitutions)
            constitutions_combs = [
                dict(zip(keys, p))
                for p in itertools.product(*(constitutions[k] for k in keys))
            ]

        # Materialise the heights so the number of requests can be counted
        # up front for the progress output.
        heights = list(ranges[gender]['heights'])
        weights = ranges[gender]['weights']
        expected_total = len(heights) * len(weights) * len(constitutions_combs)
        estimate_url = '%s/%s/measures/estimate' % (base_url, gender)

        # Loop for all combinations of heights, weights and constitutions
        for height in heights:
            for weight in weights:
                for constitutions_comb in constitutions_combs:
                    print('Height: %s, Weight: %s' % (height, weight))
                    print('Constitution:')
                    pprint(constitutions_comb)

                    params = default_params.copy()
                    params.update({
                        'weight_units': 'kg',
                        'length_units': 'cm',
                        'weight': weight,
                        'height': height,
                    })
                    params.update(constitutions_comb)
                    res = _fetch_estimate(session, estimate_url, params)

                    # Add a '<name>_range' text column for each measurement.
                    # list() snapshots the keys because res grows in the loop.
                    for measure_type in list(res):
                        res[measure_type + '_range'] = _estimate_range_text(
                            res[measure_type], measure_type, options)
                    print('Calculated measurements:')
                    pprint(res)

                    # Prepare data for writing in csv
                    data = {
                        'in': params,
                        'out': res
                    }
                    if (not writer):
                        # Now we know names of all columns and can write them in CSV
                        writer = csv.DictWriter(f, ['height', 'length_units', 'weight', 'weight_units'] + sorted(constitutions.keys() if gender != 'woman' else []) + sorted(data['out'].keys()))
                        writer.writeheader()
                    all_data = data['in'].copy()
                    all_data.update(data['out'])
                    writer.writerow(all_data)
                    total += 1
                    print('Progress: %d/%d' % (total, expected_total))
                    print()
                    # Flush after every row so finished rows reach the file
                    # even if a later request fails.
                    f.flush()
    print()
    print('*** Scraping for %s done! ***' % gender)
    print()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement