Guest User

Untitled

a guest
Apr 10th, 2023
110
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  /**
   * Download gzip-compressed sitemap files, decompress each one into the
   * current working directory, and append the page URLs it lists to `allUrls`.
   *
   * Sitemaps are processed sequentially, in the order given. Each file is
   * written under the last path segment of its URL.
   *
   * @param {string[]} [compressedSitemaps=[]] - URLs of the compressed
   *   (`.xml.gz`) sitemaps to fetch. Defaults to an empty list, in which case
   *   nothing is downloaded.
   * @returns {Promise<string[]>} Names of the decompressed files written to
   *   disk, so the caller can clean them up afterwards.
   * @throws {Error} If a request fails, the server does not answer 200, the
   *   payload cannot be decompressed or written, or the XML is invalid.
   */
  async function downloadSitemaps(compressedSitemaps = []) {
    const downloadedFiles = [];
    if (compressedSitemaps.length === 0) {
      return downloadedFiles;
    }

    // Parser options must go to the constructor; the second argument of
    // parse() only controls validation and would silently ignore them.
    const parser = new XMLParser.XMLParser({
      attributeNamePrefix: '',
      ignoreAttributes: false,
      parseAttributeValue: true,
    });

    for (const url of compressedSitemaps) {
      const fileName = url.split('/').pop();

      await downloadAndGunzip(url, fileName);
      downloadedFiles.push(fileName);
      console.log(`${fileName} downloaded and decompressed.`);

      // Read the decompressed .xml file; `true` keeps XML validation enabled
      // so a malformed download surfaces as an error instead of bad data.
      const xmlData = fs.readFileSync(fileName, 'utf-8');
      const jsonData = parser.parse(xmlData, true);
      const urls = extractUrls(jsonData);

      // `allUrls` is the accumulator declared outside this function.
      allUrls = allUrls.concat(urls);

      console.log(`URLs from ${fileName} added to the main list. Total ${allUrls.length}`);
    }

    return downloadedFiles;
  }

  /**
   * Stream `url` through gunzip into `fileName`.
   *
   * @param {string} url - HTTPS URL of the gzip-compressed resource.
   * @param {string} fileName - Path of the decompressed file to write.
   * @returns {Promise<void>} Resolves once the file has been fully written;
   *   rejects on a network error, a non-200 response, or a stream failure.
   */
  function downloadAndGunzip(url, fileName) {
    return new Promise((resolve, reject) => {
      const request = https.get(url, (response) => {
        if (response.statusCode !== 200) {
          response.resume(); // Drain the body so the socket is released.
          reject(new Error(`Failed to download ${url}: HTTP ${response.statusCode}`));
          return;
        }

        const gunzip = zlib.createGunzip();
        const writeStream = fs.createWriteStream(fileName);

        // pipe() does not forward errors downstream, so every stage needs its
        // own handler; otherwise the promise would never settle.
        const fail = (err) => {
          response.destroy();
          gunzip.destroy();
          writeStream.destroy();
          reject(err);
        };
        response.on('error', fail);
        gunzip.on('error', fail);
        writeStream.on('error', fail);
        writeStream.on('finish', resolve);

        response.pipe(gunzip).pipe(writeStream);
      });

      // Connection-level failures (DNS, TLS, refused) are emitted on the request.
      request.on('error', reject);
    });
  }
  45.  
  /**
   * Collect the `loc` value of every `url` entry in a parsed sitemap document.
   *
   * The function is synchronous and performs no I/O. A sitemap index
   * (`sitemapindex`) lists other sitemaps rather than pages, and resolving
   * those requires network requests that cannot complete before a synchronous
   * function returns; such documents therefore yield an empty list and their
   * child sitemaps must be fetched by an asynchronous caller.
   *
   * @param {object} jsonData - Parsed sitemap XML (e.g. `{ urlset: { url } }`).
   * @returns {string[]} The `loc` values found; empty when there is no
   *   `urlset`, the `urlset` has no entries, or the input is nullish.
   */
  function extractUrls(jsonData) {
    const urlset = jsonData?.urlset;
    if (!urlset) {
      return [];
    }

    // A single <url> child is parsed as an object rather than an array.
    const entries = Array.isArray(urlset.url) ? urlset.url : [urlset.url];

    // Skip missing entries (empty urlset) and entries without a <loc>.
    return entries
      .filter((entry) => entry?.loc != null)
      .map((entry) => entry.loc);
  }
  73.  
  /**
   * Delete sitemap files from disk.
   *
   * Deletion is best-effort: a failure on one file is logged and the
   * remaining files are still processed, so the returned promise never
   * rejects because of an unlink error.
   *
   * @param {string[]|string} [fileNames=[]] - Path(s) of the files to remove.
   *   When omitted, nothing is deleted.
   * @returns {Promise<void>} Resolves once every file has been attempted.
   */
  async function deleteSitemapFiles(fileNames = []) {
    // Accept a single path as well, so a bare string is not iterated per character.
    const names = typeof fileNames === 'string' ? [fileNames] : fileNames;

    for (const fileName of names) {
      try {
        await fs.promises.unlink(fileName);
        console.log(`${fileName} deleted.`);
      } catch (err) {
        console.error(`Error deleting ${fileName}: ${err}`);
      }
    }
  }
  84.  
  // Run the download step first, then hand whatever it resolved with to the
  // cleanup step and wait for that to finish as well.
  await deleteSitemapFiles(await downloadSitemaps());
Advertisement
Add Comment
Please, Sign In to add comment