Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import {Dataset, createPuppeteerRouter, utils, RetryRequestError, enqueueLinks, Request} from 'crawlee';
- import parsePrice from 'parse-price';
// Router that the route handlers in this module are registered on.
export const router = createPuppeteerRouter();

// Running total of product records pushed to the dataset.
let productsCount = 0;

// Product IDs that have already been handled, used to skip duplicates.
const pids = new Set();

// Declared but never assigned or read in this module.
let goodCookies;
/**
 * Default route, used for every request that has no dedicated label handler.
 *
 * When the loaded URL is the supermarket homepage, the handler first walks
 * through the sign-in form and then raises the pool concurrency to 4.
 * Afterwards (for every page except `/ayuda/` URLs) it enqueues pagination
 * links, top-menu category links and product links, then closes the page.
 *
 * Throws RetryRequestError when the challenge helper reports a failure.
 */
router.addDefaultHandler(async ({ enqueueLinks, log, page, request, crawler }) => {
    const HOMEPAGE_URL = 'https://www.elcorteingles.es/supermercado/';

    // Runs the challenge helper on the current HTML and asks for a retry on failure.
    const assertNoChallenge = async () => {
        const html = await page.content();
        const passed = await checkForChallengeValidation(page, request, html, log);
        if (!passed) {
            throw new RetryRequestError(`Security challenge wildly appeared. Retrying: ${request.loadedUrl}`);
        }
    };

    // Clicks `selector` while already listening for the navigation it triggers.
    const clickAndNavigate = (selector) => Promise.all([
        page.waitForNavigation(),
        page.click(selector),
    ]);

    if (request.loadedUrl === HOMEPAGE_URL) {
        await assertNoChallenge();

        log.info('Homepage opened, going to sign in');
        await clickAndNavigate('a.ts-login-desktop');
        log.info(`Login in progress on ${request.loadedUrl}`);

        log.info('Writing email...');
        await wait(3000);
        await page.type('#login', ''); // Dragos's account
        await wait(3000);

        log.info('Writing password...');
        await page.type('#password', ''); // Dragos's account

        log.info('Clicking on login button');
        await clickAndNavigate('#login-btn');

        crawler.autoscaledPool.maxConcurrency = 4;
    }

    await page.setViewport({ width: 1920, height: 6000 });

    // Help pages are not crawled any further.
    if (request.url.includes('/ayuda/')) {
        return;
    }

    await assertNoChallenge();

    // Link groups to enqueue, in order: pagination, top-menu categories, products.
    const linkGroups = [
        { selector: '[rel="next"]', label: 'category' },
        { selector: 'a.top_menu-item', label: 'category' },
        { selector: 'a.js-product-link', label: 'product', forefront: true },
    ];
    for (const group of linkGroups) {
        await enqueueLinks(group);
    }

    await page.close();
});
/**
 * Handler for requests labelled 'category' (listing pages).
 *
 * Enqueues the next pagination page (label 'category') and every product
 * link on the page (label 'product', at the front of the queue), then closes
 * the page.
 *
 * Fixes over the previous version:
 *  - the label was misspelled as 'cateogry', so requests enqueued with the
 *    'category' label never reached this handler;
 *  - `enqueueLinks` is now taken from the crawling context; previously the
 *    name resolved to the standalone function imported from 'crawlee', which
 *    was called without a request queue.
 *
 * Throws RetryRequestError when the challenge helper reports a failure.
 */
router.addHandler('category', async ({ request, page, log, enqueueLinks }) => {
    await page.setViewport({
        width: 1920,
        height: 6000,
    });

    const content = await page.content();
    const securityCheck = await checkForChallengeValidation(page, request, content, log);
    if (!securityCheck) {
        throw new RetryRequestError(`Security challenge wildly appeared. Retrying: ${request.loadedUrl}`);
    }

    // Pagination: follow the "next page" link of the listing.
    await enqueueLinks({
        selector: '[rel="next"]',
        label: 'category',
    });

    // Product detail pages are put at the front of the queue.
    await enqueueLinks({
        selector: 'a.js-product-link',
        label: 'product',
        forefront: true,
    });

    await page.close();
});
/**
 * Handler for requests labelled 'product' (product detail pages).
 *
 * Steps:
 *  1. Run the challenge helper and bail out with a retry on failure or on an
 *     "Access Denied" page.
 *  2. Read the `id_center` value embedded in the page scripts. If it is not
 *     '010002', enqueue a fresh homepage request (which triggers the sign-in
 *     flow of the default handler) and retry this product afterwards.
 *  3. Derive the product ID from the loaded URL and skip IDs already handled.
 *  4. Parse price(s), image, description, brand, categories and availability
 *     and push one record to the default dataset.
 *
 * Throws:
 *  - RetryRequestError for challenge failure, "Access Denied" and a changed
 *    shop location (previously the class itself was thrown instead of an
 *    instance for the first two cases);
 *  - Error when no product ID can be derived from the URL. A plain Error is
 *    used on purpose so the crawler's normal retry limit applies.
 */
router.addHandler('product', async ({ request, page, log, crawler }) => {
    await page.setViewport({
        width: 1920,
        height: 6000,
    });

    const securityCheck = await checkForChallengeValidation(page, request, await page.content(), log);
    if (!securityCheck) {
        throw new RetryRequestError(`Security challenge wildly appeared. Retrying: ${request.loadedUrl}`);
    }

    // Read the HTML after the check: the helper may have reloaded the page.
    const content = await page.content();
    if (content.includes('Access Denied')) {
        await wait(10000);
        throw new RetryRequestError(`Access denied. Retrying: ${request.loadedUrl}`);
    }

    const $ = await utils.puppeteer.parseWithCheerio(page);

    // The callback must be synchronous: cheerio only stops iterating when the
    // callback returns exactly `false`, and an async callback returns a Promise.
    let idCenter = null;
    $('script').each((index, element) => {
        const scriptContent = $(element).html() || '';
        const idCenterMatch = scriptContent.match(/"id_center":"([^"]+)"/);
        if (!idCenterMatch) {
            return undefined;
        }
        idCenter = idCenterMatch[1];
        log.warning(`Found idCenter ${idCenter}`);
        return false; // Stop iterating once the id_center is found
    });

    if (idCenter !== '010002') {
        crawler.autoscaledPool.maxConcurrency = 1;
        // NOTE(review): pause() is awaited from inside a running request
        // handler - confirm it cannot block waiting for this very task to end.
        await crawler.autoscaledPool.pause();
        const signInReq = new Request({
            url: 'https://www.elcorteingles.es/supermercado/',
            // Random key so the homepage request is not de-duplicated by the queue.
            uniqueKey: Math.random().toString(36).slice(2, 7),
        });
        await crawler.addRequests([signInReq], { forefront: true, waitForAllRequestsToBeAdded: true });
        await crawler.autoscaledPool.resume();
        throw new RetryRequestError(`Shop location changed. Retrying: ${request.loadedUrl}`);
    }

    const timestamp = Date.now();
    const productUrl = request.loadedUrl;

    // ID patterns in priority order; the first one found in the URL wins.
    const productIdPatterns = [/A\d+/, /MP_\d+_\d+/, /\d+/];
    let productId;
    for (const pattern of productIdPatterns) {
        const idMatch = productUrl.match(pattern);
        if (idMatch) {
            productId = idMatch[0];
            break;
        }
    }
    if (!productId) {
        throw new Error(`No product ID found in ${productUrl}`);
    }

    if (pids.has(productId)) {
        return;
    }

    // Builds a price from the first two <span> children (integer part, cents).
    const readSplitPrice = (selector) => {
        const spans = $(selector).find('span');
        return parsePrice(`${spans.eq(0).text()}.${spans.eq(1).text()}`);
    };

    const CURRENT = 'div.prices-price._current';
    const OFFER = 'div.prices-price._offer';
    const BEFORE = 'div.prices-price._before';

    let priceOld;
    let price;
    if ($(`${CURRENT} span`).length > 0) {
        price = readSplitPrice(CURRENT);
    } else if ($(`${OFFER} span`).length > 0) {
        price = readSplitPrice(OFFER);
        priceOld = readSplitPrice(BEFORE);
    } else if ($(OFFER).text().length > 0) {
        price = parsePrice($(OFFER).text());
        priceOld = parsePrice($(BEFORE).text());
    } else if ($(CURRENT).text().length > 0) {
        price = parsePrice($(CURRENT).text());
    }

    if (price === 0) {
        log.warning(`Error1: price 0 on ${request.loadedUrl}`);
    }
    if (priceOld === 0) {
        log.warning(`Error1: priceOld 0 on ${request.loadedUrl}`);
    }

    // Prefix the scheme unless the src already has one; a missing src becomes
    // an empty string instead of the literal text 'https:undefined'.
    const imageSrc = $('img.js-zoom-to-modal-image').attr('src');
    let images = '';
    if (imageSrc) {
        images = /^https?:/.test(imageSrc) ? imageSrc : `https:${imageSrc}`;
    } else {
        log.warning(`No product image found on ${request.loadedUrl}`);
    }

    // Each list item is followed by a single space (trailing space included).
    const description = $('ul.info-list')
        .find('li')
        .toArray()
        .map((item) => `${$(item).text().trim()} `)
        .join('');

    const brand = $('span[itemprop="brand"]').text();

    // All <li> entries inside <ol> elements except the last one.
    const categoriesList = $('ol')
        .find('li')
        .slice(0, -1)
        .toArray()
        .map((item) => $(item).text().trim());

    const availability = $('button.js-add-item').attr('aria-label') === 'añadir al carro';

    const productNameCleaned = $('title').text().split('·')[0].trim();
    const productName = `${brand} ${productNameCleaned}`;
    console.log(productName);

    // Values are stringified as-is: a price that was not found is exported as
    // the string 'undefined'.
    const export_dict = {
        'url': `${productUrl}#010002`,
        'product_id': [productId],
        'timestamp': [String(timestamp)],
        'name': [productName],
        'price_old': [String(priceOld)],
        'price': [String(price)],
        'image': [images],
        'description': [description],
        'brand_text': [brand],
        'categories': categoriesList,
        'availability': [availability],
    };

    // Nothing is awaited between the duplicate check above and this add, so
    // two concurrent handlers cannot both claim the same ID. The claim is
    // rolled back when the push fails so that a retry is not skipped.
    pids.add(productId);
    try {
        await Dataset.pushData(export_dict);
    } catch (error) {
        pids.delete(productId);
        throw error;
    }

    productsCount += 1;
    console.dir(export_dict);
    log.info(`${productsCount} products scraped`);

    await page.close();
});
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function wait(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
/**
 * Waits out the anti-bot challenge page, reloading until it disappears.
 *
 * While the HTML contains the challenge marker the function logs a warning,
 * sleeps `waitTime` milliseconds, reloads the page and re-reads its HTML.
 *
 * Changes over the previous version:
 *  - the number of reloads is bounded by `maxAttempts`; before, the function
 *    recursed without limit and could never return `false`, which left the
 *    callers' retry branches unreachable;
 *  - the HTML is re-read after the reload, so each check looks at the page
 *    that is actually displayed instead of the pre-reload snapshot;
 *  - the warning goes through the `log` argument, which was unused.
 *
 * Side effect: disables the page's default navigation timeout.
 *
 * @param {object} page - Puppeteer page being processed.
 * @param {object} request - Current request; `loadedUrl` is used for logging.
 * @param {string} content - HTML of the page as currently loaded.
 * @param {object} log - Logger exposing `warning(message)`.
 * @param {number} [waitTime=8000] - Milliseconds to wait before each reload.
 * @param {number} [maxAttempts=5] - Maximum number of wait-and-reload cycles.
 * @returns {Promise<boolean>} `true` once no challenge marker is present,
 *     `false` if it is still present after `maxAttempts` reloads.
 */
async function checkForChallengeValidation(page, request, content, log, waitTime = 8000, maxAttempts = 5) {
    const challengeMarker = '/_sec/cp_challenge/ak-challenge-3-10.htm';

    page.setDefaultNavigationTimeout(0);

    let html = content;
    for (let attempt = 0; html.includes(challengeMarker); attempt += 1) {
        if (attempt >= maxAttempts) {
            return false;
        }
        log.warning(`Waiting ${waitTime / 1000} seconds, security challenge appeared on ${request.loadedUrl}`);
        await wait(waitTime);
        await page.reload({ timeout: 0, waitUntil: 'networkidle2' });
        html = await page.content();
    }
    return true;
}
Advertisement
Add Comment
Please, Sign In to add comment