Guest User

Untitled

a guest
Jun 13th, 2023
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import {Dataset, createPuppeteerRouter, utils, RetryRequestError, enqueueLinks, Request} from 'crawlee';
  2. import parsePrice from 'parse-price';
  3.  
  4.  
  5. export const router = createPuppeteerRouter();
  6. let productsCount = 0
  7. let pids = new Set()
  8. let goodCookies;
  9. router.addDefaultHandler(async ({ enqueueLinks, log, page, request, session, crawler }) => {
  10.     if (request.loadedUrl === 'https://www.elcorteingles.es/supermercado/') {
  11.         let content = await page.content()
  12.         let securityCheck = await checkForChallengeValidation(page, request, content, log)
  13.         if (!securityCheck){
  14.             throw new RetryRequestError(`Security challenge wildly appeared. Retrying: ${request.loadedUrl}`)
  15.         }
  16.         log.info('Homepage opened, going to sign in')
  17.         await Promise.all([
  18.             page.waitForNavigation(),
  19.             page.click('a.ts-login-desktop'),
  20.         ]);
  21.         log.info(`Login in progress on ${request.loadedUrl}`);
  22.         log.info('Writing email...')
  23.         await wait(3000)
  24.         await page.type('#login', ''); // Dragos's account
  25.         await wait(3000)
  26.         log.info('Writing password...')
  27.         await page.type('#password', ''); // Dragos's account
  28.         log.info('Clicking on login button')
  29.         await Promise.all([
  30.             page.waitForNavigation(),
  31.             page.click('#login-btn'),
  32.         ]);
  33.         crawler.autoscaledPool.maxConcurrency = 4
  34.     }
  35.  
  36.  
  37.     await page.setViewport({
  38.         width: 1920,
  39.         height: 6000,
  40.       });
  41.     if (request.url.includes('/ayuda/')){
  42.         return;
  43.     }
  44.     let content = await page.content()
  45.     let securityCheck = await checkForChallengeValidation(page, request, content, log)
  46.     if (!securityCheck){
  47.         throw new RetryRequestError(`Security challenge wildly appeared. Retrying: ${request.loadedUrl}`)
  48.     }
  49.  
  50.     await enqueueLinks({
  51.         // regexps: [/(\/supermercado\/[a-zA-Z]+)/],
  52.         selector: '[rel="next"]',
  53.         label: 'category'
  54.     });
  55.     // log.info(`enqueueing new categories URLs`);
  56.     await enqueueLinks({
  57.         selector: 'a.top_menu-item',
  58.         label: 'category'
  59.     });
  60.     await enqueueLinks({
  61.         // regexps: [/(\/supermercado\/[0-9]+)/],
  62.         selector: 'a.js-product-link',
  63.         label: 'product',
  64.         forefront: true,
  65.     });
  66.  
  67.     await page.close()
  68.     page = null;
  69. });
  70.  
  71.  
  72. router.addHandler('cateogry', async ({ request, page, log }) => {
  73.     await page.setViewport({
  74.         width: 1920,
  75.         height: 6000,
  76.       });
  77.  
  78.     const content = await page.content()
  79.     let securityCheck = await checkForChallengeValidation(page, request, content, log)
  80.     if (!securityCheck){
  81.         throw new RetryRequestError(`Security challenge wildly appeared. Retrying: ${request.loadedUrl}`)
  82.     }
  83.  
  84.     await enqueueLinks({
  85.         // regexps: [/(\/supermercado\/[a-zA-Z]+)/],
  86.         selector: '[rel="next"]',
  87.         label: 'category'
  88.     });
  89.     await enqueueLinks({
  90.         // regexps: [/(\/supermercado\/[0-9]+)/],
  91.         selector: 'a.js-product-link',
  92.         label: 'product',
  93.         forefront: true,
  94.     });
  95.     await page.close()
  96.     page = null;
  97. });
  98.  
  99. router.addHandler('product', async ({ request, page, log, crawler, parseWithCheerio }) => {
  100.  
  101.     await page.setViewport({
  102.         width: 1920,
  103.         height: 6000,
  104.       });
  105.     const content = await page.content()
  106.     let securityCheck = await checkForChallengeValidation(page, request, content, log);
  107.     if (!securityCheck){
  108.         throw RetryRequestError
  109.     }
  110.  
  111.     if (content.includes('Access Denied')){
  112.         await wait(10000)
  113.         throw RetryRequestError
  114.     }
  115.     const $ = await utils.puppeteer.parseWithCheerio(page);
  116.     let idCenter = null;
  117.  
  118.     $('script').each(async (index, element) => {
  119.         const scriptContent = $(element).html();
  120.         const idCenterMatch = scriptContent.match(/"id_center":"([^"]+)"/);
  121.         if (idCenterMatch) {
  122.             idCenter = idCenterMatch[1];
  123.             log.warning(`Found idCenter ${idCenter}`);
  124.             return false; // Stop iterating once the id_center is found
  125.         }
  126.     });
  127.     if (!(idCenter === '010002')) {
  128.         crawler.autoscaledPool.maxConcurrency = 1
  129.         await crawler.autoscaledPool.pause()
  130.         let signInReq = new Request({
  131.             url: 'https://www.elcorteingles.es/supermercado/',
  132.             uniqueKey: Math.random().toString(36).substr(2, 5)
  133.         })
  134.         await crawler.addRequests([signInReq], {forefront: true, waitForAllRequestsToBeAdded: true})
  135.         await crawler.autoscaledPool.resume()
  136.         throw new RetryRequestError(`Shop location changed. Retrying: ${request.loadedUrl}`)
  137.     }
  138.  
  139.     let timestamp = Date.now()
  140.     let productUrl = request.loadedUrl
  141.     let pidRegex1 = /A\d+/;
  142.     let pidRegex2 = /MP_\d+_\d+/;
  143.     let pidRegex3 = /\d+/;
  144.     let productId;
  145.     if (productUrl.match(pidRegex1) && productUrl.match(pidRegex1).length > 0){
  146.         productId = productUrl.match(pidRegex1)[0]
  147.     } else if (productUrl.match(pidRegex2) && productUrl.match(pidRegex2).length > 0){
  148.         productId = productUrl.match(pidRegex2)[0]
  149.     } else if (productUrl.match(pidRegex3) && productUrl.match(pidRegex3).length > 0){
  150.         productId = productUrl.match(pidRegex3)[0];
  151.     }
  152.     if (!productId){
  153.         throw RetryRequestError
  154.     }
  155.  
  156.     if (pids.has(productId)){
  157.         return
  158.     } else {
  159.         pids.add(productId)
  160.     }
  161.  
  162.     let priceOld;
  163.     let price;
  164.     if ($('div.prices-price._current span').length > 0){
  165.         let priceData = $('div.prices-price._current')
  166.         let priceDataInt = priceData.find('span').eq(0).text();
  167.         let priceDataCents = priceData.find('span').eq(1).text();
  168.         price = parsePrice(priceDataInt + '.' + priceDataCents)
  169.     } else if($('div.prices-price._offer span').length > 0 )  {
  170.         let priceData = $('div.prices-price._offer')
  171.         let priceDataInt = priceData.find('span').eq(0).text();
  172.         let priceDataCents = priceData.find('span').eq(1).text();
  173.         let priceOldData = $('div.prices-price._before')
  174.         let priceOldDataInt = priceOldData.find('span').eq(0).text();
  175.         let priceOldDataCents = priceOldData.find('span').eq(1).text();
  176.         price = parsePrice(priceDataInt + '.' + priceDataCents)
  177.         priceOld = parsePrice(priceOldDataInt + '.' + priceOldDataCents)
  178.     } else if ($('div.prices-price._offer').text().length > 0) {
  179.         let priceData = $('div.prices-price._offer').text()
  180.         let priceOldData = $('div.prices-price._before').text()
  181.         price = parsePrice(priceData)
  182.         priceOld = parsePrice(priceOldData)
  183.     } else if ($('div.prices-price._current').text().length > 0){
  184.         let priceData = $('div.prices-price._current').text()
  185.         price = parsePrice(priceData)
  186.     }
  187.     if (price === 0){
  188.         log.warning(`Error1: price 0 on ${request.loadedUrl}`)
  189.     }
  190.     if (priceOld === 0){
  191.         log.warning(`Error1: priceOld 0 on ${request.loadedUrl}`)
  192.     }
  193.     let images = 'https:' + $('img.js-zoom-to-modal-image').attr('src')
  194.     let descriptionData = $('ul.info-list')
  195.     let description = ''
  196.     descriptionData.find('li').each(function (index, element) {
  197.         description += $(element).text().trim() + ' ';
  198.     });
  199.     let brand = $('span[itemprop="brand"]').text();
  200.     let categoriesData = $('ol')
  201.     let categoriesList = []
  202.     categoriesData.find('li').slice(0, -1).each(function (index, element) {
  203.         categoriesList.push($(element).text().trim());
  204.     });
  205.     let availability;
  206.     if ($('button.js-add-item').attr('aria-label') === 'añadir al carro'){
  207.         availability = true;
  208.     } else {
  209.         availability = false;
  210.     }
  211.     let productNameData = $('title').text()
  212.     let productNameCleaned = productNameData.split("·")[0].trim()
  213.     let productName = `${brand} ${productNameCleaned}`
  214.     console.log(productName)
  215.     let export_dict = {
  216.         'url': `${productUrl}#010002`,
  217.         'product_id': [productId],
  218.         'timestamp': [String(timestamp)],
  219.         'name': [productName],
  220.         'price_old': [String(priceOld)],
  221.         'price': [String(price)],
  222.         'image': [images],
  223.         'description': [description],
  224.         'brand_text': [brand],
  225.         'categories': categoriesList,
  226.         'availability': [availability],
  227.     }
  228.     await Dataset.pushData(export_dict);
  229.     productsCount ++
  230.     console.dir(export_dict)
  231.     log.info(`${productsCount} products scraped`)
  232.     await page.close()
  233.     export_dict = null;
  234.  
  235. });
  236.  
  237. function wait(ms) {
  238.     return new Promise(resolve => setTimeout(resolve, ms));
  239. }
  240.  
  241. async function checkForChallengeValidation(page, request, content, log, waitTime = 8000) {
  242.     await page.setDefaultNavigationTimeout(0);
  243.     if (content.includes('/_sec/cp_challenge/ak-challenge-3-10.htm')) {
  244.     //   waitTime += 1000 // add an additional second to the wait time
  245.       console.warn(`Waiting ${waitTime / 1000} seconds, security challenge appeared on ${request.loadedUrl}`)
  246.       await wait(waitTime)
  247.       content = await page.content()
  248.     //   await page.waitForTimeout(2000);
  249.       await page.reload({timeout: 0, waitUntil: 'networkidle2'})
  250.       return checkForChallengeValidation(page, request, content, log, waitTime) // recursive call
  251.     } else {
  252.       await page.content()
  253.       return true
  254.     }
  255.   }
  256.  
Advertisement
Add Comment
Please, Sign In to add comment