Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import { PuppeteerCrawler, Dataset, Configuration, log, LogLevel, Request, purgeDefaultStorages, ProxyConfiguration } from 'crawlee';
- import { router } from './routes.js';
- import puppeteerExtra from 'puppeteer-extra';
- import stealthPlugin from 'puppeteer-extra-plugin-stealth';
- import adblockerPlugin from 'puppeteer-extra-plugin-adblocker';
- import qs from 'querystring';
// Register the browser plugins on the shared puppeteer-extra instance; the
// crawler below uses this instance as its launcher, so this must run first.
puppeteerExtra.use(stealthPlugin());
puppeteerExtra.use(adblockerPlugin({ blockTrackersAndAnnoyances: true }));

log.setLevel(LogLevel.DEBUG);

// Environment values are strings. Assigning the number 0 relied on Node's
// implicit (deprecated) coercion, so the string is written out explicitly.
process.env.CRAWLEE_PURGE_ON_START = '0';

// Use the same id for the default key-value store, request queue and dataset.
process.env.CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID = 'ECI_Supermercado';
process.env.CRAWLEE_DEFAULT_REQUEST_QUEUE_ID = 'ECI_Supermercado';
process.env.CRAWLEE_DEFAULT_DATASET_ID = 'ECI_Supermercado';

// The purge is triggered by hand here, after the storage ids above are set.
log.warning('Purging local storage');
await purgeDefaultStorages();

const config = Configuration.getGlobalConfig();
config.set('disableBrowserSandbox', true);
// Alternative browser binary, currently disabled:
// config.set('defaultBrowserPath', '/usr/bin/chromium-browser')
config.set('availableMemoryRatio', 0.95);

/**
 * Proxy configuration shared with the crawler (and exported for other modules).
 *
 * NOTE(review): `proxyUrls` is an empty list here — presumably the real URLs
 * were removed before sharing. Crawlee may reject an empty array at
 * construction time; confirm and fill in the proxies before running.
 */
export const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [],
});
/**
 * Resource types whose requests are aborted instead of being sent.
 *
 * The list is compared against `request.resourceType()`.
 * NOTE(review): entries such as 'webp', 'png' or 'svg' look like file formats
 * rather than resource types and may never match — kept unchanged; confirm
 * before pruning.
 */
const RESOURCE_EXCLUSIONS = new Set(['image', 'media', 'webp', 'imageset', 'avif', 'svg', 'png', 'gif', 'font']);

/**
 * Pre-navigation hook that enables request interception on the page and
 * aborts every request whose resource type is in RESOURCE_EXCLUSIONS; all
 * other requests are allowed to continue.
 *
 * @param {{ page: object }} context - Crawling context; only `page` is used.
 * @returns {Promise<void>} Resolves once interception is enabled and the
 *   request listener is attached.
 */
export const abortAssets = async ({ page }) => {
    await page.setRequestInterception(true);

    // `page.on` registers the listener synchronously, so there is nothing to await.
    page.on('request', (request) => {
        // Another interceptor (the adblocker plugin is registered on the same
        // launcher) may already have resolved this request; resolving it a
        // second time would reject with "Request is already handled!".
        // The optional call keeps this working if the method is unavailable.
        if (request.isInterceptResolutionHandled?.()) {
            return undefined;
        }

        if (RESOURCE_EXCLUSIONS.has(request.resourceType())) {
            return request.abort();
        }
        return request.continue();
    });
};
// Single-concurrency Puppeteer crawler driven by the shared router. The
// browser is launched through puppeteer-extra so the plugins registered on it apply.
const crawler = new PuppeteerCrawler({
    proxyConfiguration,
    requestHandler: router,
    // headless: false,
    maxConcurrency: 1,
    maxRequestRetries: 30,
    launchContext: {
        launcher: puppeteerExtra,
        launchOptions: {
            args: [
                '--disable-dev-shm-usage',
            ],
        },
    },
    useSessionPool: true,
    /**
     * Called when a request has exhausted its retries. Logged at error level
     * with the actual retry count and the last error, so a permanently
     * failed URL is not hidden when the log level is raised above DEBUG.
     */
    failedRequestHandler({ request }, error) {
        const reason = error?.message ?? 'unknown error';
        log.error(`Request ${request.url} failed after ${request.retryCount} retries: ${reason}`);
    },
    preNavigationHooks: [
        abortAssets,
        // Treat navigation as finished at DOMContentLoaded.
        async (_, gotoOptions) => {
            gotoOptions.waitUntil = 'domcontentloaded';
        },
    ],
});

// Crawl from the supermarket landing page, then export the dataset as JSON
// under the given key into the named key-value store.
await crawler.run(['https://www.elcorteingles.es/supermercado/']);
await Dataset.exportToJSON('ECI_Supermercado_Barcelona', { toKVS: 'Supermercado-Barcelona' });
Advertisement
Add Comment
Please sign in to add a comment.