// Good Food Scrape

const fetch = require('cross-fetch');
const fs = require('fs');
const path = require('path');
const cheerio = require('cheerio');
// Running total of recipes successfully scraped during this run.
let recipeCount = 0;

// Output path where the scraped recipe data is appended.
// This is a placeholder: replace the user directory and YOURFOLDERNAME with a
// real, existing folder before running the script.
const outputLocation = 'C:/Users/KEN/Desktop/YOURFOLDERNAME/Json-Search-Results/recipe_data.json';
// URLs that have already been visited. A URL is recorded whether or not it
// satisfied the recipe criteria, so it is never fetched a second time.
const scrapedURLs = new Set();
// Timestamp (ms since epoch) assigned when the crawl is started.
let startTime;

/**
 * Checks whether a URL belongs to the bbcgoodfood.com site, so the crawler
 * only follows links on that domain.
 *
 * The URL is parsed and its host compared exactly; a plain prefix check would
 * also accept look-alike hosts such as `https://www.bbcgoodfood.com.evil.example`.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} True only for https URLs whose host is www.bbcgoodfood.com;
 *   false for anything else, including non-strings and unparseable input.
 */
const isBBCGoodFoodURL = (url) => {
  if (typeof url !== 'string') {
    return false;
  }

  let parsed;
  try {
    parsed = new URL(url);
  } catch (error) {
    // Relative or malformed URLs cannot be on the target site.
    return false;
  }

  return parsed.protocol === 'https:' && parsed.hostname === 'www.bbcgoodfood.com';
};

/**
 * Fetches a page and extracts recipe data from it.
 *
 * A page only counts as a recipe when all of the following are present:
 * a successful HTTP response, a title, cook/prep information, an ingredients
 * list and a preparation method. The masthead image is optional.
 *
 * Side effects: on success, increments the module-level `recipeCount` and logs
 * the running total.
 *
 * @param {string} url - Page to fetch and scrape.
 * @returns {Promise<Object|null>} The recipe record, or null when the page is
 *   not a complete recipe or an error occurred while fetching/parsing.
 */
const scrapeData = async (url) => {
  try {
    // Condition 1: the fetch must succeed.
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error('Network response was not ok');
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Condition 2: a recipe title must be present.
    const recipeTitle = $('h1.heading-1').text().trim();
    if (!recipeTitle) {
      return null;
    }

    // Condition 3: cook/prep information must be present.
    const additionalInfoList = [];
    $('ul.recipe__cook-and-prep.list.list--horizontal li').each((index, element) => {
      additionalInfoList.push($(element).text().trim());
    });
    const additionalInfo = additionalInfoList.join(', ');
    if (!additionalInfo) {
      return null;
    }

    // Condition 4: the ingredients list section must contain text.
    const ingredientsListContainer = $('section.recipe__ingredients[data-component="IngredientsList"]');
    const ingredientsListText = ingredientsListContainer.text().trim();
    if (!ingredientsListText) {
      return null;
    }

    // Condition 5: the preparation method section must contain text.
    const preparationMethodContainer = $('section.recipe__method-steps[data-placement="MethodList"]');
    const preparationMethodText = preparationMethodContainer.text().trim();
    if (!preparationMethodText) {
      return null;
    }

    // Collect image sources from the masthead container. An <img> without a
    // `src` attribute yields undefined from .attr(); returning null makes
    // cheerio's .map() skip it instead of throwing on .trim() and losing an
    // otherwise complete recipe.
    const mastheadContainer = $('.container.post-header__container.post-header__container--masthead-layout');
    const mastheadImages = mastheadContainer
      .find('img')
      .map((_, img) => {
        const src = $(img).attr('src');
        return typeof src === 'string' ? src.trim() : null;
      })
      .get();

    // Record returned to the caller, which is responsible for persisting it.
    const data = {
      "Recipe Title": recipeTitle,
      "Additional Information": additionalInfo,
      "Ingredients List": ingredientsListText,
      "Cooking Method": preparationMethodText,
      "Recipe Image": mastheadImages.length > 0 ? mastheadImages[0] : ""
    };
    recipeCount++;
    console.log(`${recipeCount} Recipes Retrieved`);
    return data;
  } catch (error) {
    // Best effort: a failing page must not stop the crawl, but the failure is
    // reported rather than silently discarded.
    console.error(`Error scraping data from ${url}:`, error);
    return null;
  }
};

/**
 * Recursively crawls bbcgoodfood.com starting at `url`: scrapes the page,
 * appends any recipe found to the output file, then follows every on-site
 * link found on the page.
 *
 * URLs that are off-site or already recorded in `scrapedURLs` are skipped.
 * Errors are logged and never rethrown, so the returned promise always
 * fulfils.
 *
 * NOTE: each record is appended as `{...},\n`, so the output file is a
 * comma-separated sequence of JSON objects rather than a single valid JSON
 * document; a consumer has to strip the trailing comma and wrap it in `[...]`.
 *
 * @param {string} url - Page to crawl.
 * @returns {Promise<void>} Resolves once this page and all pages reachable
 *   from it have been processed.
 */
const crawlAndScrape = async (url) => {
  try {
    if (!isBBCGoodFoodURL(url) || scrapedURLs.has(url)) {
      return;
    }

    // Mark as visited before the first await so concurrent crawls of the same
    // URL bail out at the check above.
    scrapedURLs.add(url);
    const data = await scrapeData(url);
    if (data) {
      fs.appendFileSync(outputLocation, `${JSON.stringify(data, null, 2)},\n`, 'utf8');
    }

    // NOTE(review): scrapeData has already fetched this URL; the page is
    // fetched a second time here to extract its links.
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error('Network response was not ok');
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Gather unique, not-yet-visited on-site links in document order.
    const childURLs = new Set();
    $('a[href^="https://www.bbcgoodfood.com"]').each((index, element) => {
      const childURL = $(element).attr('href');
      if (!scrapedURLs.has(childURL)) {
        childURLs.add(childURL);
      }
    });

    // Start all child crawls concurrently and wait for them, so no promise is
    // left floating and the caller can tell when the crawl has finished.
    // crawlAndScrape never rejects, so one failing page cannot abort the rest.
    await Promise.all([...childURLs].map((childURL) => crawlAndScrape(childURL)));
  } catch (error) {
    console.error(`Error crawling and scraping ${url}:`, error);
  }
};

/**
 * Timer callback that ends the run: prints a notice that the crawl was
 * stopped and terminates the process with a success exit code.
 */
function stopCrawling() {
  const successExitCode = 0;
  console.log('Crawling stopped after 2 minutes.');
  process.exit(successExitCode);
}

// Kick off the crawl at the site root. The returned promise is not awaited;
// crawlAndScrape handles and logs its own errors.
void crawlAndScrape('https://www.bbcgoodfood.com');

// Record the start time and schedule the shutdown once the crawl window
// (2 minutes = 120,000 milliseconds) has elapsed.
startTime = Date.now();
const crawlDurationMs = 120000;
setTimeout(stopCrawling, crawlDurationMs);