import { CheerioCrawler, ProxyConfiguration, purgeDefaultStorages, log, LogLevel, Configuration, RequestList, downloadListOfUrls } from 'crawlee';
import { router } from './routes.js';
import https from 'https';
import fs from 'fs';
import zlib from 'zlib';
import { XMLParser } from 'fast-xml-parser';
import { ApifyStorageLocal } from '@apify/storage-local';
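// routes.js is not included in this paste. A minimal sketch of what it is
// assumed to export: a Cheerio router whose default handler stores some data
// for every page (the selector and fields below are placeholders, not Zoro's
// actual markup or the original author's handler):
//
// import { createCheerioRouter, Dataset } from 'crawlee';
//
// export const router = createCheerioRouter();
//
// router.addDefaultHandler(async ({ request, $, log }) => {
//     log.debug(`Scraping ${request.url}`);
//     await Dataset.pushData({
//         url: request.loadedUrl,
//         title: $('title').text().trim(),
//     });
// });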
// Use the local storage client instead of the default in-memory storage.
const storageLocal = new ApifyStorageLocal();
Configuration.getGlobalConfig().set('storageClient', storageLocal);

log.setLevel(LogLevel.DEBUG);
// console.log('Purging local storage')
// await purgeDefaultStorages();

// Environment variables must be strings; '0' disables purging on start.
process.env['CRAWLEE_PURGE_ON_START'] = '0';
process.env['CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID'] = 'MyProject';
process.env['CRAWLEE_DEFAULT_REQUEST_QUEUE_ID'] = 'MyProject';
process.env['CRAWLEE_DEFAULT_DATASET_ID'] = 'MyProject';
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [
        'myproxy', // placeholder; replace with a real proxy URL
    ],
});

let allUrls = [];
// fast-xml-parser v4 takes its options in the constructor, not in parse().
const xmlOptions = {
    attributeNamePrefix: '',
    ignoreAttributes: false,
    parseAttributeValue: true,
};
const parser = new XMLParser(xmlOptions);

async function downloadSitemaps() {
    const compressedSitemaps = [];
    for (let i = 1; i <= 481; i++) {
        compressedSitemaps.push(`https://www.zoro.com/sitemaps/usa/sitemap-product-${i}.xml.gz`);
    }

    for (const url of compressedSitemaps) {
        // The decompressed XML is written under the original .xml.gz name so
        // deleteSitemapFiles() can find it later.
        const fileName = url.split('/').pop();
        const writeStream = fs.createWriteStream(fileName);

        await new Promise((resolve, reject) => {
            https.get(url, (response) => {
                response
                    .pipe(zlib.createGunzip()) // Decompress the .xml.gz file
                    .on('error', reject)       // surface gunzip errors
                    .pipe(writeStream)
                    .on('finish', () => {
                        console.log(`${fileName} downloaded and decompressed.`);
                        resolve();
                    })
                    .on('error', reject);      // surface write errors
            }).on('error', reject);            // surface request errors
        });

        // Read the decompressed .xml file and extract the product URLs.
        const xmlData = fs.readFileSync(fileName, 'utf-8');
        const jsonData = parser.parse(xmlData);
        const urls = await extractUrls(jsonData);

        // Add the URLs to the main list.
        allUrls = allUrls.concat(urls);
        console.log(`URLs from ${fileName} added to the main list. Total ${allUrls.length}`);
    }
}
async function extractUrls(jsonData) {
    const urls = [];
    if (jsonData.urlset) {
        // A plain sitemap: collect every <loc> entry.
        const urlArray = Array.isArray(jsonData.urlset.url) ? jsonData.urlset.url : [jsonData.urlset.url];
        urlArray.forEach((url) => {
            urls.push(url.loc);
        });
    } else if (jsonData.sitemapindex) {
        // A sitemap index: fetch each referenced sitemap and extract its URLs
        // before returning, so the results are not lost in an unawaited callback.
        const sitemapArray = Array.isArray(jsonData.sitemapindex.sitemap) ? jsonData.sitemapindex.sitemap : [jsonData.sitemapindex.sitemap];
        for (const sitemap of sitemapArray) {
            const sitemapData = await new Promise((resolve, reject) => {
                https.get(sitemap.loc, (response) => {
                    let data = '';
                    response.on('data', (chunk) => { data += chunk; });
                    response.on('end', () => resolve(data));
                }).on('error', reject);
            });
            const sitemapJsonData = parser.parse(sitemapData);
            const sitemapUrls = await extractUrls(sitemapJsonData);
            urls.push(...sitemapUrls);
        }
    }
    return urls;
}
async function deleteSitemapFiles() {
    for (let i = 1; i <= 481; i++) {
        const fileName = `sitemap-product-${i}.xml.gz`;
        try {
            await fs.promises.unlink(fileName);
            console.log(`${fileName} deleted.`);
        } catch (err) {
            console.error(`Error deleting ${fileName}: ${err}`);
        }
    }
}
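// Note: recent Crawlee releases ship a Sitemap helper that can replace the
// manual download/gunzip/parse pipeline above. Availability depends on the
// installed version, so this is only a hedged sketch, not a drop-in replacement:
//
// import { Sitemap } from '@crawlee/utils';
//
// const sitemap = await Sitemap.load('https://www.zoro.com/sitemaps/usa/sitemap-product-1.xml.gz');
// allUrls = allUrls.concat(sitemap.urls);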
await downloadSitemaps();
await deleteSitemapFiles();

console.log(`Starting to add ${allUrls.length} urls to the request queue`);
// const ReqList = new RequestList({
//     sources: allUrls,
//     persistRequestsKey: 'Zoro-ReqList',
//     keepDuplicateUrls: false
// });
// await ReqList.initialize()
// console.log(ReqList.length)

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
    minConcurrency: 32,
    maxConcurrency: 256,
    maxRequestRetries: 20,
    navigationTimeoutSecs: 10,
    loggingInterval: 30,
    useSessionPool: true,
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed 20 times.`);
    },
});
let totalCount = allUrls.length;
let counter = 0;
const chunkSize = 1000000;

// Enqueue the URLs in chunks so a single addRequests() call does not have to
// process the whole list at once.
for (let i = 0; i < allUrls.length; i += chunkSize) {
    const chunk = allUrls.slice(i, i + chunkSize);
    await crawler.addRequests(chunk);
    console.log(`Added ${chunk.length} to the queue. ${totalCount -= chunk.length} left.`);
    await wait(15000);
}
// for (let pUrl of allUrls) {
//     counter += 1
//     await crawler.addRequests([pUrl]);
//     console.log(`Added ${counter} to the queue. ${totalCount - counter} left`)
// }
// await crawler.addRequests(allUrls);

async function wait(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}

await crawler.run();
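// Note on running this script: it relies on top-level await, so it must be
// executed as an ES module (e.g. "type": "module" in package.json, or an .mjs
// file name) with `node <entry-file>`; the entry file name is not part of this
// paste.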