Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
// Dependencies: HTTP client, filesystem output, path utilities, HTML parser.
const fetch = require('cross-fetch');
const fs = require('fs');
const path = require('path');
const cheerio = require('cheerio');

// Running total of recipes successfully scraped (used for progress logging).
let recipeCount = 0;

// Output path where the JSON file will be created and saved.
// FIX: the original line wrapped the string literal in stray '<' and '>'
// characters ('<C:/...'>), which is a syntax error.
const outputLocation = 'C:/Users/KEN/Desktop/YOURFOLDERNAME/Json-Search-Results/recipe_data.json';

// URLs already visited — either scraped, or checked and rejected — so the
// crawler can skip anything it has seen before.
const scrapedURLs = new Set();

let startTime;
// Keep the crawler on-site: accept only URLs on the bbcgoodfood.com domain.
const isBBCGoodFoodURL = (url) => url.startsWith('https://www.bbcgoodfood.com');
// Scrape a single page. Returns a recipe-data object when the page satisfies
// all five criteria (fetch OK, title, cook/prep info, ingredients, method);
// returns null otherwise, or on any fetch/parse error.
const scrapeData = async (url) => {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      // Condition 1: the fetch request must succeed.
      throw new Error('Network response was not ok');
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Condition 2: a recipe title must be present.
    const recipeTitle = $('h1.heading-1').text().trim();
    if (!recipeTitle) {
      return null;
    }

    // Condition 3: cook-and-prep information must be present.
    const additionalInfoList = [];
    $('ul.recipe__cook-and-prep.list.list--horizontal li').each((index, element) => {
      additionalInfoList.push($(element).text().trim());
    });
    const additionalInfo = additionalInfoList.join(', ');
    if (!additionalInfo) {
      return null;
    }

    // Condition 4: the ingredients list must be present.
    const ingredientsListContainer = $('section.recipe__ingredients[data-component="IngredientsList"]');
    const ingredientsListText = ingredientsListContainer.text().trim();
    if (!ingredientsListText) {
      return null;
    }

    // Condition 5: the preparation-method steps must be present.
    const preparationMethodContainer = $('section.recipe__method-steps[data-placement="MethodList"]');
    const preparationMethodText = preparationMethodContainer.text().trim();
    if (!preparationMethodText) {
      return null;
    }

    // Masthead images for the recipe photo.
    // FIX: the original called .trim() directly on attr('src'), which throws a
    // TypeError when an <img> has no src attribute; guard with a fallback and
    // drop empty values.
    const mastheadContainer = $('.container.post-header__container.post-header__container--masthead-layout');
    const mastheadImages = mastheadContainer
      .find('img')
      .map((_, img) => ($(img).attr('src') || '').trim())
      .get()
      .filter((src) => src !== '');

    const data = {
      "Recipe Title": recipeTitle,
      "Additional Information": additionalInfo,
      "Ingredients List": ingredientsListText,
      "Cooking Method": preparationMethodText,
      "Recipe Image": mastheadImages.length > 0 ? mastheadImages[0] : ""
    };
    recipeCount++;
    console.log(recipeCount + ' Recipes Retrieved');
    return data;
  } catch (error) {
    // Any failure (network, parsing) means this page is skipped; the crawler
    // treats null the same as "criteria not met".
    return null;
  }
};
// Recursively crawl bbcgoodfood.com: scrape the given page, append any
// complete recipe to the output file, then follow every same-domain link.
// Never throws — errors are logged and the crawl continues elsewhere.
const crawlAndScrape = async (url) => {
  try {
    // Skip off-domain links and anything already visited or checked.
    if (!isBBCGoodFoodURL(url) || scrapedURLs.has(url)) {
      return;
    }
    scrapedURLs.add(url);

    const data = await scrapeData(url);
    if (data) {
      // NOTE(review): entries are appended with a trailing ',\n', so the
      // resulting file is a comma-separated sequence of objects, not a valid
      // standalone JSON document — consumers must wrap it in '[' ... ']'.
      fs.appendFileSync(outputLocation, JSON.stringify(data, null, 2) + ',\n', 'utf8');
    }

    const response = await fetch(url);
    if (!response.ok) {
      throw new Error('Network response was not ok');
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Collect the same-domain links first, then crawl them one at a time.
    // FIX: the original fired each recursive call without awaiting it, so a
    // single page could spawn hundreds of concurrent crawls and the fan-out
    // grew without bound; sequential awaiting keeps resource use flat.
    const childURLs = $('a[href^="https://www.bbcgoodfood.com"]')
      .map((_, element) => $(element).attr('href'))
      .get();
    for (const childURL of childURLs) {
      await crawlAndScrape(childURL);
    }
  } catch (error) {
    console.error(`Error crawling and scraping ${url}:`, error);
  }
};
// Hard-stop the crawl: log the reason and terminate the whole process.
// Used as the callback for the 2-minute watchdog timer set at startup.
const stopCrawling = function () {
  console.log('Crawling stopped after 2 minutes.');
  process.exit(0);
};
// Record when the crawl began, then kick it off from the site root. The
// returned promise is intentionally not awaited: the watchdog timer below
// ends the process regardless of crawl progress.
startTime = Date.now();
crawlAndScrape('https://www.bbcgoodfood.com');
// Watchdog: stop everything after 2 minutes (120,000 ms).
setTimeout(stopCrawling, 120000);
Advertisement
Add Comment
Please Sign In to add a comment.