import { CheerioCrawler, ProxyConfiguration, purgeDefaultStorages, log, LogLevel, Configuration, RequestList, downloadListOfUrls } from 'crawlee';
import { router } from './routes.js';
import https from 'https';
import fs from 'fs';
import zlib from 'zlib';
// fast-xml-parser v4 exposes XMLParser as a named export
import { XMLParser } from 'fast-xml-parser';
import { ApifyStorageLocal } from '@apify/storage-local';

const storageLocal = new ApifyStorageLocal();
Configuration.getGlobalConfig().set('storageClient', storageLocal);

log.setLevel(LogLevel.DEBUG);
// console.log('Purging local storage')
// await purgeDefaultStorages();
// Environment variables are strings; override the Crawlee defaults before the crawler starts
process.env['CRAWLEE_PURGE_ON_START'] = '0';
process.env['CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID'] = 'MyProject';
process.env['CRAWLEE_DEFAULT_REQUEST_QUEUE_ID'] = 'MyProject';
process.env['CRAWLEE_DEFAULT_DATASET_ID'] = 'MyProject';

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: [
        'myproxy', // placeholder; replace with a real proxy URL
    ],
});

let allUrls = [];

// fast-xml-parser v4 takes its options in the XMLParser constructor, not in parse();
// one parser instance is shared by downloadSitemaps() and extractUrls()
const xmlOptions = {
    attributeNamePrefix: '',
    ignoreAttributes: false,
    parseAttributeValue: true,
};
const xmlParser = new XMLParser(xmlOptions);

async function downloadSitemaps() {
    const compressedSitemaps = [];

    // Zoro publishes its product sitemap as numbered, gzipped XML files
    for (let i = 1; i <= 481; i++) {
        compressedSitemaps.push(`https://www.zoro.com/sitemaps/usa/sitemap-product-${i}.xml.gz`);
    }

    for (const url of compressedSitemaps) {
        const fileName = url.split('/').pop();
        const writeStream = fs.createWriteStream(fileName);

        await new Promise((resolve, reject) => {
            https.get(url, (response) => {
                response
                    .pipe(zlib.createGunzip()) // decompress the .xml.gz file
                    .pipe(writeStream)
                    .on('finish', () => {
                        console.log(`${fileName} downloaded and decompressed.`);
                        resolve();
                    })
                    .on('error', reject);
            }).on('error', reject);
        });

        // Read the decompressed .xml file back in
        const xmlData = fs.readFileSync(fileName, 'utf-8');

        // Parse the XML and pull out the product URLs
        const jsonData = xmlParser.parse(xmlData);
        const urls = await extractUrls(jsonData);

        // Add the URLs to the main list
        allUrls = allUrls.concat(urls);

        console.log(`URLs from ${fileName} added to the main list. Total ${allUrls.length}`);
    }
}
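
// Alternative sketch (not in the original paste): the temporary files could be skipped
// entirely by gunzipping straight into memory and parsing the resulting string, e.g.:
//
//   const xmlData = await new Promise((resolve, reject) => {
//       https.get(url, (response) => {
//           const gunzip = zlib.createGunzip();
//           const chunks = [];
//           response.pipe(gunzip);
//           gunzip.on('data', (c) => chunks.push(c));
//           gunzip.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8')));
//           gunzip.on('error', reject);
//       }).on('error', reject);
//   });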

// extractUrls is async because a sitemap index points at nested sitemaps that must be
// fetched and parsed before their URLs can be returned (the original fire-and-forget
// https.get discarded those URLs and referenced an out-of-scope parser)
async function extractUrls(jsonData) {
    const urls = [];

    if (jsonData.urlset) {
        const urlArray = Array.isArray(jsonData.urlset.url) ? jsonData.urlset.url : [jsonData.urlset.url];
        urlArray.forEach((url) => {
            urls.push(url.loc);
        });
    } else if (jsonData.sitemapindex) {
        const sitemapArray = Array.isArray(jsonData.sitemapindex.sitemap) ? jsonData.sitemapindex.sitemap : [jsonData.sitemapindex.sitemap];
        for (const sitemap of sitemapArray) {
            // Download the nested sitemap and wait for it, so its URLs are not lost
            const sitemapData = await new Promise((resolve, reject) => {
                https.get(sitemap.loc, (response) => {
                    let data = '';
                    response.on('data', (chunk) => { data += chunk; });
                    response.on('end', () => resolve(data));
                    response.on('error', reject);
                }).on('error', reject);
            });
            const sitemapJsonData = xmlParser.parse(sitemapData);
            urls.push(...(await extractUrls(sitemapJsonData)));
        }
    }

    return urls;
}

async function deleteSitemapFiles() {
    for (let i = 1; i <= 481; i++) {
        const fileName = `sitemap-product-${i}.xml.gz`;
        try {
            await fs.promises.unlink(fileName);
            console.log(`${fileName} deleted.`);
        } catch (err) {
            console.error(`Error deleting ${fileName}: ${err}`);
        }
    }
}

await downloadSitemaps();
await deleteSitemapFiles();

console.log(`Starting to add ${allUrls.length} urls to the RequestList`);
// const ReqList = new RequestList({
//   sources: allUrls,
//   persistRequestsKey: 'Zoro-ReqList',
//   keepDuplicateUrls: false
// });
// await ReqList.initialize()
// console.log(ReqList.length)

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: router,
    minConcurrency: 32,
    maxConcurrency: 256,
    maxRequestRetries: 20,
    navigationTimeoutSecs: 10,
    loggingInterval: 30,
    useSessionPool: true,
    failedRequestHandler({ request }) {
        log.debug(`Request ${request.url} failed 20 times.`);
    },
});
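
// `router` is imported from './routes.js', which is not included in this paste.
// A minimal sketch of what that file might contain (an assumption, not the original
// file), using Crawlee's Cheerio router:
//
//   import { createCheerioRouter, Dataset } from 'crawlee';
//
//   export const router = createCheerioRouter();
//
//   router.addDefaultHandler(async ({ request, $ }) => {
//       await Dataset.pushData({
//           url: request.url,
//           title: $('title').text().trim(),
//       });
//   });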

let totalCount = allUrls.length;
let counter = 0;
const chunkSize = 1000000;
// Add the URLs to the request queue in chunks, pausing between batches
for (let i = 0; i < allUrls.length; i += chunkSize) {
    const chunk = allUrls.slice(i, i + chunkSize);
    await crawler.addRequests(chunk);
    console.log(`Added ${chunk.length} to the queue. ${totalCount -= chunk.length} left.`);
    await wait(15000);
}
// for (let pUrl of allUrls) {
//   counter += 1
//   await crawler.addRequests([pUrl]);
//   console.log(`Added ${counter} to the queue. ${totalCount - counter} left`)
// }

// await crawler.addRequests(allUrls);
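// Note: crawler.addRequests() enqueues requests in background batches on its own, so a
// single call may be enough (a sketch, assuming the Crawlee 3.x options object):
//
//   await crawler.addRequests(allUrls, { waitForAllRequestsToBeAdded: true });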

// Simple pause helper used between addRequests() batches
async function wait(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
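// Crawlee also ships a sleep() utility that could replace wait() (an assumption about
// the crawlee exports): `import { sleep } from 'crawlee';` then `await sleep(15000);`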

await crawler.run();