Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Downloads every gzip-compressed sitemap in `compressed_sitemaps`, writes the
 * decompressed XML to a local file named after the last path segment of the
 * URL, parses it, and appends the page URLs it yields to the outer `allUrls`
 * list (declared outside this block).
 *
 * @param {string[]} [compressed_sitemaps=[]] URLs of the `.xml.gz` sitemaps to fetch.
 * @returns {Promise<string[]>} Names of the local files that were written, so a
 *   caller can delete them afterwards.
 * @throws {Error} If a request fails, the server does not answer 200, or the
 *   payload cannot be decompressed or written.
 */
async function downloadSitemaps(compressed_sitemaps = []) {
  // Parsing options must be given to the XMLParser constructor; the second
  // argument of parse() is a validation switch, not parser configuration.
  const parser = new XMLParser.XMLParser({
    attributeNamePrefix: '',
    ignoreAttributes: false,
    parseAttributeValue: true,
  });
  const fileNames = [];

  for (const url of compressed_sitemaps) {
    const fileName = url.split('/').pop();
    await downloadAndGunzip(url, fileName);
    fileNames.push(fileName);
    console.log(`${fileName} downloaded and decompressed.`);

    // Read the decompressed XML back and turn it into a plain object.
    const xmlData = await fs.promises.readFile(fileName, 'utf-8');
    const jsonData = parser.parse(xmlData);

    // Nested sitemap indexes need network I/O, so they are resolved here
    // (awaited) rather than inside the synchronous extractUrls().
    const urls = await collectUrls(jsonData, parser, new Set([url]));

    // Add the URLs to the main list
    allUrls = allUrls.concat(urls);
    console.log(`URLs from ${fileName} added to the main list. Total ${allUrls.length}`);
  }

  return fileNames;
}

/**
 * Extracts the `<loc>` values of a parsed `<urlset>` document.
 *
 * This function is synchronous and performs no I/O. A `<sitemapindex>` only
 * lists further sitemaps that would have to be fetched, so it yields an empty
 * array here.
 *
 * @param {object} jsonData Parsed sitemap document.
 * @returns {string[]} Page URLs found in `jsonData.urlset.url`.
 */
function extractUrls(jsonData) {
  return toList(jsonData?.urlset?.url)
    .map((entry) => entry?.loc)
    .filter((loc) => loc != null);
}

/** Normalises "missing | single item | array" into an array. */
function toList(value) {
  if (value == null) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}

/** Issues a GET request and resolves with the response stream once the status is 200. */
function getOkResponse(url) {
  return new Promise((resolve, reject) => {
    https
      .get(url, (response) => {
        if (response.statusCode === 200) {
          resolve(response);
          return;
        }
        response.resume(); // discard the body so the socket is released
        reject(new Error(`Request for ${url} failed with status ${response.statusCode}`));
      })
      .on('error', reject);
  });
}

/** Streams `url` through gunzip into `fileName`, rejecting if any stage fails. */
async function downloadAndGunzip(url, fileName) {
  const response = await getOkResponse(url);
  await new Promise((resolve, reject) => {
    const gunzip = zlib.createGunzip();
    const writeStream = fs.createWriteStream(fileName);
    // pipe() does not forward errors, so every stage needs its own handler.
    const fail = (err) => {
      response.destroy();
      gunzip.destroy();
      writeStream.destroy();
      reject(err);
    };
    response.on('error', fail);
    gunzip.on('error', fail);
    writeStream.on('error', fail);
    writeStream.on('finish', resolve);
    response.pipe(gunzip).pipe(writeStream);
  });
}

/** Downloads a sitemap into memory and returns its XML text, gunzipping when the body is gzip data. */
async function fetchSitemapXml(url) {
  const response = await getOkResponse(url);
  const chunks = [];
  // Collect raw buffers; decoding chunk by chunk could split multi-byte characters.
  for await (const chunk of response) {
    chunks.push(chunk);
  }
  const body = Buffer.concat(chunks);
  const isGzip = body.length > 1 && body[0] === 0x1f && body[1] === 0x8b;
  return (isGzip ? zlib.gunzipSync(body) : body).toString('utf-8');
}

/** Resolves the page URLs of a parsed sitemap, following the entries of a sitemap index. */
async function collectUrls(jsonData, parser, seen) {
  if (!jsonData?.sitemapindex) {
    return extractUrls(jsonData);
  }
  const collected = [];
  for (const sitemap of toList(jsonData.sitemapindex.sitemap)) {
    const location = sitemap?.loc;
    // Skip entries without a location and ones already visited (guards against cycles).
    if (location == null || seen.has(location)) {
      continue;
    }
    seen.add(location);
    const childXml = await fetchSitemapXml(location);
    collected.push(await collectUrls(parser.parse(childXml), parser, seen));
  }
  return collected.flat();
}
/**
 * Deletes the given local sitemap files one after another.
 *
 * A failure to delete one file is logged and does not stop the remaining
 * deletions; the returned promise never rejects because of an unlink error.
 *
 * @param {string[]} [fileNames=[]] Names (paths) of the files to remove.
 * @returns {Promise<void>}
 */
async function deleteSitemapFiles(fileNames = []) {
  // `?? []` also tolerates an explicit null from a caller that has nothing to delete.
  for (const fileName of fileNames ?? []) {
    try {
      await fs.promises.unlink(fileName);
      console.log(`${fileName} deleted.`);
    } catch (err) {
      console.error(`Error deleting ${fileName}: ${err}`);
    }
  }
}
// Run the download step first, then hand whatever it resolves with to the cleanup step.
await deleteSitemapFiles(await downloadSitemaps());
Advertisement
Add Comment
Please, Sign In to add comment