NotSooFriendly94

Good Food Scrape

Sep 4th, 2024
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
JavaScript 4.70 KB | Source Code | 0 0
  const fetch = require('cross-fetch');
  const fs = require('fs');
  const path = require('path');
  const cheerio = require('cheerio');
  // Running total of recipes successfully scraped; incremented by scrapeData
  // and used only for progress logging.
  let recipeCount = 0;

  // Output path where the scraped recipe data is appended.
  // Replace YOURFOLDERNAME (and the user folder) with a real location.
  // The original value was wrapped in placeholder angle brackets, with the
  // closing '>' outside the string literal, which is a syntax error.
  const outputLocation = 'C:/Users/KEN/Desktop/YOURFOLDERNAME/Json-Search-Results/recipe_data.json';

  // URLs that have already been visited, whether or not they yielded a recipe,
  // so that no page is fetched and scraped more than once.
  const scrapedURLs = new Set();

  // Timestamp (ms since epoch) of when the crawl was started; assigned at startup.
  let startTime;
  10.  
  /**
   * Check whether a URL points at the www.bbcgoodfood.com site over HTTPS.
   *
   * The URL is parsed and its protocol and hostname are compared exactly. A
   * plain string-prefix test would also accept look-alike hosts such as
   * "https://www.bbcgoodfood.com.evil.example" or
   * "https://www.bbcgoodfood.com@evil.example".
   *
   * @param {string} url - Candidate URL.
   * @returns {boolean} true only for a parseable https URL whose hostname is
   *   exactly www.bbcgoodfood.com; false for anything else, including
   *   non-string or malformed input.
   */
  const isBBCGoodFoodURL = (url) => {
    if (typeof url !== 'string') {
      return false;
    }
    try {
      const parsed = new URL(url);
      return parsed.protocol === 'https:' && parsed.hostname === 'www.bbcgoodfood.com';
    } catch (error) {
      // new URL() throws on malformed input; such a value is simply not a match.
      return false;
    }
  };
  15.  
  /**
   * Fetch a page and extract recipe data from it.
   *
   * A page is treated as a recipe only when all of these are present:
   *   1. the HTTP response is OK,
   *   2. a recipe title (h1.heading-1),
   *   3. cook-and-prep information,
   *   4. the ingredients list section,
   *   5. the method-steps section.
   * If any is missing the function returns null and the page is skipped.
   *
   * Side effects: on success, increments the module-level recipeCount and logs
   * the running total to the console.
   *
   * @param {string} url - Page to fetch and scrape.
   * @returns {Promise<Object|null>} An object with the keys "Recipe Title",
   *   "Additional Information", "Ingredients List", "Cooking Method" and
   *   "Recipe Image" (an empty string when no image is found), or null when the
   *   page is not a complete recipe or could not be fetched or parsed.
   */
  const scrapeData = async (url) => {
    try {
      const response = await fetch(url);
      if (!response.ok) {
        // Condition 1: a bad HTTP response means there is nothing to scrape.
        return null;
      }
      const html = await response.text();
      const $ = cheerio.load(html);

      // Condition 2: recipe title.
      const recipeTitle = $('h1.heading-1').text().trim();
      if (!recipeTitle) {
        return null;
      }

      // Condition 3: cook/prep information, one entry per list item.
      const additionalInfoList = [];
      $('ul.recipe__cook-and-prep.list.list--horizontal li').each((index, element) => {
        additionalInfoList.push($(element).text().trim());
      });
      const additionalInfo = additionalInfoList.join(', ');
      if (!additionalInfo) {
        return null;
      }

      // Condition 4: visible text of the ingredients list section.
      const ingredientsListText = $('section.recipe__ingredients[data-component="IngredientsList"]')
        .text()
        .trim();
      if (!ingredientsListText) {
        return null;
      }

      // Condition 5: visible text of the preparation method section.
      const preparationMethodText = $('section.recipe__method-steps[data-placement="MethodList"]')
        .text()
        .trim();
      if (!preparationMethodText) {
        return null;
      }

      // Image URLs from the masthead container. An <img> without a src
      // attribute yields undefined, which cheerio's map() drops; calling
      // .trim() on it directly would throw and discard the whole recipe.
      const mastheadContainer = $('.container.post-header__container.post-header__container--masthead-layout');
      const mastheadImages = mastheadContainer
        .find('img')
        .map((_, img) => $(img).attr('src'))
        .get()
        .filter((src) => typeof src === 'string' && src.trim() !== '')
        .map((src) => src.trim());

      const data = {
        "Recipe Title": recipeTitle,
        "Additional Information": additionalInfo,
        "Ingredients List": ingredientsListText,
        "Cooking Method": preparationMethodText,
        "Recipe Image": mastheadImages.length > 0 ? mastheadImages[0] : ""
      };

      recipeCount++;
      console.log(`${recipeCount} Recipes Retrieved`);
      return data;
    } catch (error) {
      // Deliberately best-effort: a page that cannot be fetched or parsed is
      // treated the same as a page that is not a recipe.
      return null;
    }
  };
  81.  
  /**
   * Recursively crawl bbcgoodfood.com starting from `url`.
   *
   * For each page not yet visited: mark it visited, scrape it via scrapeData,
   * append any recipe found to the output file, then follow every absolute
   * https://www.bbcgoodfood.com link on the page.
   *
   * The returned promise settles once this page and every page reached from it
   * have been processed. Errors are logged and never rethrown, so the promise
   * does not reject.
   *
   * NOTE: each recipe is appended as a pretty-printed object followed by ",\n".
   * The file is therefore a comma-separated sequence of objects, not a valid
   * JSON document on its own; consumers must strip the trailing comma and wrap
   * the content in [ ]. The format is kept as-is for compatibility.
   *
   * @param {string} url - Page to crawl.
   * @returns {Promise<void>}
   */
  const crawlAndScrape = async (url) => {
    try {
      if (!isBBCGoodFoodURL(url) || scrapedURLs.has(url)) {
        return;
      }

      // Mark as visited before any await so concurrent crawls of the same URL
      // bail out at the check above.
      scrapedURLs.add(url);

      const data = await scrapeData(url);
      if (data) {
        // Create the output directory on demand; appendFileSync creates the
        // file but not missing parent directories.
        fs.mkdirSync(path.dirname(outputLocation), { recursive: true });
        fs.appendFileSync(outputLocation, JSON.stringify(data, null, 2) + ',\n', 'utf8');
      }

      // TODO: scrapeData has already downloaded this page; it is fetched again
      // here only to extract links. Sharing the HTML would halve the requests.
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error('Network response was not ok');
      }
      const html = await response.text();
      const $ = cheerio.load(html);

      // Collect unique link targets. The fragment is dropped because it never
      // reaches the server: "page#comments" and "page" are the same document
      // and would otherwise be scraped twice.
      const childURLs = new Set();
      $('a[href^="https://www.bbcgoodfood.com"]').each((index, element) => {
        childURLs.add($(element).attr('href').split('#')[0]);
      });

      // Crawl the children concurrently, but await them so that no promise is
      // left floating and the caller can tell when the crawl has finished.
      await Promise.all([...childURLs].map((childURL) => crawlAndScrape(childURL)));
    } catch (error) {
      console.error(`Error crawling and scraping ${url}:`, error);
    }
  };
  111.  
  /**
   * Timer callback that ends the crawl: logs a notice and terminates the
   * process with exit code 0. Any crawl work still in flight is abandoned.
   *
   * @returns {void} Does not return in practice, because the process exits.
   */
  const stopCrawling = () => {
    console.log('Crawling stopped after 2 minutes.');
    process.exit(0);
  };
  117.  
  // Maximum time the crawl may run before the process is stopped: 2 minutes.
  const CRAWL_DURATION_MS = 120000;

  // Record the start time before any crawl work begins.
  startTime = Date.now();

  // Arm the stop timer; stopCrawling exits the process when it fires.
  setTimeout(stopCrawling, CRAWL_DURATION_MS);

  // Start crawling and scraping from the base URL. crawlAndScrape logs its own
  // errors, so this handler is only a safety net against an unexpected rejection.
  crawlAndScrape('https://www.bbcgoodfood.com').catch((error) => {
    console.error('Unexpected crawl failure:', error);
  });
Advertisement
Add Comment
Please, Sign In to add comment