Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Function to check if a URL is from the bbcgoodfood.com domain. This ensures that the
// URLs being crawled really belong to https://www.bbcgoodfood.com.
//
// A plain prefix test is not sufficient on its own: strings such as
// "https://www.bbcgoodfood.com.evil.example/" or "https://www.bbcgoodfood.com@evil.example/"
// start with the expected text but resolve to a different host. The URL is therefore also
// parsed and its hostname compared exactly.
//
// Parameters:
//   url - the candidate URL (expected to be an absolute URL string).
// Returns:
//   true only when `url` is a string that starts with "https://www.bbcgoodfood.com" and whose
//   parsed hostname is exactly "www.bbcgoodfood.com"; false for everything else, including
//   non-string values and strings that cannot be parsed as a URL.
const isBBCGoodFoodURL = (url) => {
  const requiredPrefix = 'https://www.bbcgoodfood.com';
  const requiredHostname = 'www.bbcgoodfood.com';

  if (typeof url !== 'string' || !url.startsWith(requiredPrefix)) {
    return false;
  }

  try {
    return new URL(url).hostname === requiredHostname;
  } catch {
    // `new URL` throws a TypeError for malformed input (e.g. an invalid port); treat as "not ours".
    return false;
  }
};
// Function to scrape recipe data from a URL.
//
// Fetches the page, parses the HTML with cheerio and extracts the recipe fields. A page only
// counts as a recipe when all five conditions below hold; otherwise null is returned.
//
// Parameters:
//   url - address of the page to fetch.
// Returns (through the promise):
//   an object with the keys "Recipe Title", "Additional Information", "Ingredients List",
//   "Cooking Method" and "Recipe Image" (an empty string when no masthead image has a usable
//   src), or null when the request fails, a required section is missing/empty, or anything
//   throws while scraping.
// Side effect:
//   increments the outer `recipeCount` counter once for every recipe object returned.
const scrapeData = async (url) => {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      // Condition 1: the fetch must succeed; a bad response is routed to the catch below.
      throw new Error('Network response was not ok');
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Condition 2: the page must have a recipe title.
    const recipeTitle = $('h1.heading-1').text().trim();
    if (!recipeTitle) {
      return null;
    }

    // Condition 3: the cook/prep list must yield some text (entries are joined with ", ").
    const additionalInfo = $('ul.recipe__cook-and-prep.list.list--horizontal li')
      .map((_, element) => $(element).text().trim())
      .get()
      .join(', ');
    if (!additionalInfo) {
      return null;
    }

    // Condition 4: the ingredients section must contain visible text.
    const ingredientsListText = $('section.recipe__ingredients[data-component="IngredientsList"]')
      .text()
      .trim();
    if (!ingredientsListText) {
      return null;
    }

    // Condition 5: the preparation method section must contain visible text.
    const preparationMethodText = $('section.recipe__method-steps[data-placement="MethodList"]')
      .text()
      .trim();
    if (!preparationMethodText) {
      return null;
    }

    // Collect image sources from the masthead container. An <img> without a src attribute
    // makes attr('src') return undefined; calling trim() on that used to throw and discard the
    // whole recipe, so missing/blank sources are skipped instead.
    const mastheadImages = $('.container.post-header__container.post-header__container--masthead-layout')
      .find('img')
      .map((_, img) => {
        const src = $(img).attr('src');
        return typeof src === 'string' ? src.trim() : '';
      })
      .get()
      .filter((src) => src !== '');

    const data = {
      'Recipe Title': recipeTitle,
      'Additional Information': additionalInfo,
      'Ingredients List': ingredientsListText,
      'Cooking Method': preparationMethodText,
      'Recipe Image': mastheadImages.length > 0 ? mastheadImages[0] : '',
    };

    recipeCount++;
    return data;
  } catch {
    // Best-effort by design: any failure while fetching or parsing is reported to the caller
    // as "no recipe here" (null) rather than as an exception.
    return null;
  }
};
// Function to recursively crawl URLs and scrape data.
//
// For a URL that passes isBBCGoodFoodURL and has not been seen before, this:
//   1. records it in `scrapedURLs` so it is never visited twice,
//   2. runs scrapeData on it and, when a recipe object comes back, appends it to
//      `outputLocation` as pretty-printed JSON followed by ",\n",
//   3. fetches the page again to collect every link whose href starts with
//      "https://www.bbcgoodfood.com" (scrapeData hands back only the extracted fields,
//      not the HTML, so the page is requested a second time here),
//   4. crawls all of those links concurrently and waits for them to finish.
//
// Parameters:
//   url - address of the page to crawl.
// Returns:
//   a promise that resolves (with no value) once this page and every page reachable from it
//   have been processed. Errors are logged with console.error and never rejected to the caller.
const crawlAndScrape = async (url) => {
  try {
    if (!isBBCGoodFoodURL(url) || scrapedURLs.has(url)) {
      return;
    }
    scrapedURLs.add(url);

    // Log the current URL being explored
    console.log(`Exploring URL: ${url}`);

    const data = await scrapeData(url);
    if (data) {
      fs.appendFileSync(outputLocation, JSON.stringify(data, null, 2) + ',\n', 'utf8');
    }

    const response = await fetch(url);
    if (!response.ok) {
      throw new Error('Network response was not ok');
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    const childURLs = $('a[href^="https://www.bbcgoodfood.com"]')
      .map((_, element) => $(element).attr('href'))
      .get();

    // Child crawls are started in document order, exactly as before, but are now awaited
    // instead of being left as floating promises, so the returned promise reflects the
    // whole sub-crawl. Each child handles its own errors, so one failure cannot abort the rest.
    await Promise.all(childURLs.map((childURL) => crawlAndScrape(childURL)));
  } catch (error) {
    console.error(`Error crawling and scraping ${url}:`, error);
  }
};
// Ends the run: prints the two-minute cutoff notice to stdout, then terminates the
// Node process with exit status 0.
const stopCrawling = () => {
  const notice = 'Crawling stopped after 2 minutes.';
  console.log(notice);

  const successExitCode = 0;
  process.exit(successExitCode);
};
// Launch the crawl from the site's home page; the call is not awaited.
crawlAndScrape('https://www.bbcgoodfood.com');

// Record the current time (epoch milliseconds) right after the crawl has been launched.
startTime = Date.now();

// Schedule the hard stop: 2 min * 60 s * 1000 ms = 120,000 milliseconds.
setTimeout(stopCrawling, 2 * 60 * 1000);
Advertisement
Add Comment
Please, Sign In to add comment