# tailor4less_scraper.py

#!/usr/bin/python3
#coding: utf8

import requests
from pprint import pprint
from random import randint, uniform
from time import sleep
from pyquery import PyQuery as pq
import csv
import itertools
import re, demjson
import decimal
from random import uniform


# The same as range() function but for decimals
class drange():
    """Decimal counterpart of ``range()``: ``start`` inclusive, ``stop`` exclusive.

    The object is its own iterator; calling ``iter()`` on it rewinds it to
    ``start``, so it can be looped over several times (but not by two loops
    at once, because they would share the same cursor).

    Args:
        start: First value produced.
        stop: Exclusive bound; iteration ends before reaching it.
        step: Increment added after each value (may be negative). A step of
            zero produces an empty sequence.

    Float arguments are converted through their ``str()`` form so that e.g.
    ``0.1`` becomes ``Decimal('0.1')`` instead of its inexact binary expansion.
    """

    def __init__(self, start, stop, step = 1):
        self.start = self._to_decimal(start)
        self.stop = self._to_decimal(stop)
        self.step = self._to_decimal(step)
        self.value = self.start

    @staticmethod
    def _to_decimal(number):
        """Convert ``number`` to Decimal without importing float rounding noise."""
        if isinstance(number, float):
            return decimal.Decimal(str(number))
        return decimal.Decimal(number)

    def __iter__(self):
        # Rewind so that every new loop starts from the beginning.
        self.value = self.start
        return self

    def __next__(self):
        ascending = self.step > 0 and self.value < self.stop
        descending = self.step < 0 and self.value > self.stop
        if ascending or descending:
            current = self.value
            self.value += self.step
            return current
        raise StopIteration()

    def __len__(self):
        """Return the number of values a full iteration yields."""
        span = self.stop - self.start
        # Nothing is produced when the step is zero, the span is empty, or the
        # step points away from ``stop``.
        if self.step == 0 or span == 0 or (span > 0) != (self.step > 0):
            return 0
        # Span and step share a sign here, so the quotient is non-negative and
        # a non-zero remainder means one extra (partial) step before ``stop``.
        whole_steps, remainder = divmod(span, self.step)
        return int(whole_steps) + (1 if remainder else 0)


# Settings
#
# `ranges` maps each gender to the 'weights' and 'heights' values the scraping
# loop iterates over. Both range() and drange() exclude their stop value, so
# the `+ 1` / `+ 0.5` below make the nominal upper bound inclusive.
#
# NOTE(review): the loop sends 'weight' in kg and 'height' in cm. 126-226 looks
# like a height range in cm and 45-150 like a weight range in kg, so the two
# keys may be swapped - confirm against the site before a full run.
#
# Wider ranges, kept commented out for reference (they replace the reduced
# `ranges` below when uncommented):
#ranges = {
#   'man': {
#       'weights': range(126, 226 + 1, 1),
#       'heights': drange(45, 150 + 0.5, 0.5),
#   },
#
#   'woman': {
#       'weights': range(126, 200 + 1, 1),
#       'heights': drange(30, 140 + 0.5, 0.5),
#   },
#}

# Reduced ranges currently in effect: only one or two values per dimension.
ranges = {
    'man': {
        'weights': range(126, 127 + 1, 1),
        'heights': drange(45, 45 + 0.5, 0.5),
    },

    'woman': {
        'weights': range(126, 126 + 1, 1),
        'heights': drange(30, 30 + 0.5, 0.5),
    },
}


def _fetch_estimate(session, url, params):
    """Request the measurement estimate and return the decoded JSON body.

    Any failure (network error, timeout, non-JSON response) is printed and the
    request is retried after a random 5-30 second pause, without an upper
    bound on the number of attempts.
    """
    while True:
        try:
            response = session.get(url, params=params, timeout=20)
            return response.json()
        except Exception as msg:
            print('ERROR:', msg)
            sleep(uniform(5, 30))
            print('Retry')


def _measure_range_text(measure_type, raw_value, options):
    """Build the 'Usually between Xcm and Ycm' hint for one measurement.

    Args:
        measure_type: Name of the measurement as returned by the estimate call.
        raw_value: Its value; anything that ``float()`` cannot parse, or that
            is not greater than zero, gives an empty hint.
        options: Decoded ``var options`` object of the measures page. Its
            ``measures.optional`` / ``measures.required`` entries carry the
            bounds, which are applied as percentage offsets of the value.

    Returns:
        The hint text, or ``''`` when no bounds are known or anything fails
        (the failure is printed, never raised).
    """
    try:
        measure = float(raw_value)
        # Written as "not >" so that NaN is rejected as well.
        if not measure > 0:
            return ''

        measures = options['measures']
        spec = None
        # Checked in this order so a "required" entry overrides an "optional"
        # entry with the same name.
        for group in (measures['optional'], measures['required']):
            if group and measure_type in group:
                spec = group[measure_type]

        if not spec:
            return ''

        if ('range' in spec) and len(spec.get('range')) == 2:
            low_pct, high_pct = spec['range'][0], spec['range'][1]
        else:
            limits = spec['range']['tolerance']['normal']['max']
            low_pct, high_pct = limits[0], limits[1]

        low = round(measure * (low_pct / 100 + 1))
        high = round(measure * (high_pct / 100 + 1))
        return 'Usually between %dcm and %dcm' % (low, high)
    except Exception as msg:
        # Best effort: a measurement without a usable range is not fatal.
        print('ERROR:', msg)
        return ''


for gender in ('woman', 'man'):
    total = 0
    print('==============================')
    print('*** Start scraping for %s ***' % gender)
    # newline='' is required by the csv module so that it controls the line
    # endings itself (otherwise blank rows appear on some platforms).
    with open('measurements_%s.csv' % gender, 'wt', newline='') as f:

        # Prepare object for queries; each gender is served by its own site
        if gender == 'man':
            base_url = 'https://www.tailor4less.com/en-uk'
        elif gender == 'woman':
            base_url = 'https://www.sumissura.com/en-uk'

        requests.packages.urllib3.disable_warnings()
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US;q=0.6,en;q=0.4",
            "Referer": base_url,
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
        }
        session = requests.session()
        session.headers = headers

        # Created lazily: the column names are only known after the first response
        writer = None

        # Various kinds of products have to be in the cart so that the
        # calculator offers all available measurement options
        if gender == 'man':
            products = (
                4839,
                4854,
                4746,
                646,
                616,
                378,
                5856,
                6239,
                5584,
                2092
            )

        elif gender == 'woman':
            products = (
                7053,
                6809,
                7315,
                7055,
                6420,
                6969,
                7652,
                7660,
                7314,
                7129,
                6713,
                7559,
                7045,
                7045,
                6978,
                438,
                7563,
                442,
                7562
            )

        print('Adding products to cart...')
        for product in products:
            session.get('%s/feed/buy/%d' % (base_url, product))
        print('Done!')

        # Receiving all available constitutions: radio-button name -> list of values
        print('Loading constitutions variants...')
        constitutions = {}
        url = '%s/checkout/measures/?step=start' % (base_url)
        r = session.get(url)
        dom = pq(r.text)
        for elem in dom('.constitutions .constitution input[type="radio"]'):
            elem = pq(elem)
            name, value = elem.attr('name'), elem.attr('value')
            constitutions.setdefault(name, []).append(value)
        print('Constitutions:')
        pprint(constitutions)
        print('Done!')

        # Default values of the calculator form inputs; sent with every request
        default_params = {}
        for elem in dom('.process .inputs .input > input'):
            elem = pq(elem)
            name, value = elem.attr('name'), elem.attr('value')
            default_params[name] = value

        # The page embeds its measurement settings as a JavaScript object literal
        matches = re.findall(r"var options = (\{[^=]*\});\s*<\/script>", r.text, re.DOTALL)
        if not matches:
            raise RuntimeError('Could not find the "var options" script block on %s' % url)
        options = demjson.decode(matches[0])

        # Constitutions do not affect the measurements for woman (seems to be
        # a site bug), so a single empty combination is used there
        if gender == 'woman':
            constitutions_combs = [{}]
        else:
            keys = sorted(constitutions)
            constitutions_combs = [
                dict(zip(keys, values))
                for values in itertools.product(*(constitutions[k] for k in keys))
            ]

        # Materialised so the number of requests is known up front for the
        # progress output
        heights = list(ranges[gender]['heights'])
        weights = list(ranges[gender]['weights'])
        expected_total = len(heights) * len(weights) * len(constitutions_combs)

        estimate_url = '%s/%s/measures/estimate' % (base_url, gender)

        # Loop for all combinations of heights, weights and constitutions
        for height, weight, constitutions_comb in itertools.product(heights, weights, constitutions_combs):
            print('Height: %s, Weight: %s' % (height, weight))
            print('Constitution:')
            pprint(constitutions_comb)

            # NOTE(review): units are sent as kg/cm; check that the configured
            # ranges really hold kilograms under 'weights' and centimetres
            # under 'heights'.
            params = default_params.copy()
            params.update({
                'weight_units':'kg',
                'length_units':'cm',
                'weight':weight,
                'height':height,
            })
            params.update(constitutions_comb)
            res = _fetch_estimate(session, estimate_url, params)

            # Add a "<name>_range" hint next to each returned measurement;
            # list() snapshots the keys because res grows inside the loop
            for measure_type in list(res):
                res[measure_type + '_range'] = _measure_range_text(measure_type, res[measure_type], options)

            print('Calculated measurements:')
            pprint(res)

            if (not writer):
                # Now we know names of all columns and can write them in CSV
                writer = csv.DictWriter(f, ['height', 'length_units', 'weight', 'weight_units'] + sorted(constitutions.keys() if gender != 'woman' else []) + sorted(res.keys()))
                writer.writeheader()

            # One CSV row = request parameters followed by the results
            row = params.copy()
            row.update(res)
            writer.writerow(row)

            total += 1
            print('Progress: %d/%d' % (total, expected_total))
            print()

            # Keep the file up to date so an interrupted run loses nothing
            f.flush()

    print()
    print('*** Scraping for %s done! ***' % gender)
    print()