Guest User

Untitled

a guest
Jun 13th, 2023
134
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import { PuppeteerCrawler, Dataset, Configuration, log, LogLevel, Request, purgeDefaultStorages, ProxyConfiguration } from 'crawlee';
  2. import { router } from './routes.js';
  3. import puppeteerExtra from 'puppeteer-extra';
  4. import stealthPlugin from 'puppeteer-extra-plugin-stealth';
  5. import adblockerPlugin from 'puppeteer-extra-plugin-adblocker';
  6. import qs from 'querystring';
  7.  
  8. puppeteerExtra.use(stealthPlugin());
  9. puppeteerExtra.use(adblockerPlugin({ blockTrackersAndAnnoyances: true }));
  10.  
  11. log.setLevel(LogLevel.DEBUG);
  12.  
  13. process.env['CRAWLEE_PURGE_ON_START'] = 0;
  14. process.env['CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID'] = 'ECI_Supermercado';
  15. process.env['CRAWLEE_DEFAULT_REQUEST_QUEUE_ID'] = 'ECI_Supermercado';
  16. process.env['CRAWLEE_DEFAULT_DATASET_ID'] = 'ECI_Supermercado';
  17.  
  18. log.warning('Purging local storage')
  19. await purgeDefaultStorages();
  20.  
  21. const config = Configuration.getGlobalConfig();
  22. config.set('disableBrowserSandbox', true);
  23. // config.set('defaultBrowserPath', '/usr/bin/chromium-browser')
  24. config.set('availableMemoryRatio', 0.95)
  25.  
  26. export const proxyConfiguration = new ProxyConfiguration({
  27.     proxyUrls: [],
  28. });
  29. export const abortAssets = async ({ page }) => {
  30.     const RESOURCE_EXCLUSIONS = ['image', 'media', 'webp', 'imageset', 'avif', 'svg', 'png', 'gif', 'font'];
  31.     await page.setRequestInterception(true);
  32.  
  33.     await page.on('request', (request) => {
  34.         if (RESOURCE_EXCLUSIONS.includes(request.resourceType())) {
  35.             return request.abort();
  36.         }
  37.         return request.continue();
  38.     });
  39. };
  40.  
  41. let crawler = new PuppeteerCrawler({
  42.     proxyConfiguration,
  43.     requestHandler: router,
  44.     // headless: false,
  45.     maxConcurrency: 1,
  46.     maxRequestRetries: 30,
  47.     launchContext: {
  48.         launcher: puppeteerExtra,
  49.         launchOptions: {
  50.              args: [
  51.                 '--disable-dev-shm-usage',
  52.               ]
  53.         },
  54.     },
  55.     useSessionPool: true,
  56.  
  57.     failedRequestHandler({ request }) {
  58.         log.debug(`Request ${request.url} failed 30 times.`);
  59.     },
  60.  
  61.     preNavigationHooks: [
  62.         abortAssets,
  63.         async (_, gotoOptions) => {
  64.             gotoOptions.waitUntil = "domcontentloaded";
  65.         }
  66.     ],
  67. });
  68.  
  69.  
  70. await crawler.run(['https://www.elcorteingles.es/supermercado/']);
  71. await Dataset.exportToJSON('ECI_Supermercado_Barcelona', { toKVS: 'Supermercado-Barcelona' });
  72.  
Advertisement
Add Comment
Please, Sign In to add comment