NotSooFriendly94

under line 10

Sep 4th, 2024
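
The paste picks up "under line 10" of the original script, so the imports and shared state the code relies on are not shown. The following is a minimal sketch of what those omitted lines might contain, inferred from the identifiers used below (fetch, cheerio, fs, scrapedURLs, recipeCount, outputLocation, startTime); the output path is a placeholder, not the author's.

// Assumed setup for the omitted first lines of the script (not part of the paste)
const cheerio = require('cheerio'); // HTML parsing
const fs = require('fs');           // used to append scraped recipes to disk
// fetch is global on Node 18+; older Node versions would need a package such as node-fetch instead

const scrapedURLs = new Set();          // URLs already visited by the crawler
let recipeCount = 0;                    // number of recipes saved so far
const outputLocation = 'recipes.json';  // placeholder output path; the real value is not shown in the paste
let startTime;                          // assigned just before the 2-minute timeout at the bottom
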
// Check whether a URL is from the bbcgoodfood.com domain, so the crawler only follows bbcgoodfood.com pages.
const isBBCGoodFoodURL = (url) => {
  return url.startsWith('https://www.bbcgoodfood.com');
};

// Function to scrape data from a URL
const scrapeData = async (url) => {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error('Network response was not ok'); // If the fetch response is bad, throw an error. // Condition 1 to be met.
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Scraping recipe title
    const recipeTitle = $('h1.heading-1').text().trim();
    if (!recipeTitle) {
      //console.log(`Skipping ${url}: Recipe title not found`); // Condition 2 to be met.
      return null;
    }

    // Scraping additional information
    const additionalInfoList = [];
    $('ul.recipe__cook-and-prep.list.list--horizontal li').each((index, element) => {
      additionalInfoList.push($(element).text().trim());
    });
    const additionalInfo = additionalInfoList.join(', ');
    if (!additionalInfo) {
      //console.log(`Skipping ${url}: Additional information not found`); // Condition 3 to be met.
      return null;
    }

    // Scraping visible text from the ingredients list section
    const ingredientsListContainer = $('section.recipe__ingredients[data-component="IngredientsList"]');
    const ingredientsListText = ingredientsListContainer.text().trim();
    if (!ingredientsListText) {
      //console.log(`Skipping ${url}: Ingredients list not found`); // Condition 4 to be met.
      return null;
    }

    // Scraping visible text from the preparation method section
    const preparationMethodContainer = $('section.recipe__method-steps[data-placement="MethodList"]');
    const preparationMethodText = preparationMethodContainer.text().trim();
    if (!preparationMethodText) {
      //console.log(`Skipping ${url}: Cooking method not found`); // Condition 5 to be met.
      return null;
    }

    // Scraping image URLs from the masthead container (guarding against images with no src attribute)
    const mastheadContainer = $('.container.post-header__container.post-header__container--masthead-layout');
    const mastheadImages = mastheadContainer.find('img').map((_, img) => ($(img).attr('src') || '').trim()).get();

    // Building the recipe object that crawlAndScrape appends to the JSON output file
    const data = {
      "Recipe Title": recipeTitle,
      "Additional Information": additionalInfo,
      "Ingredients List": ingredientsListText,
      "Cooking Method": preparationMethodText,
      "Recipe Image": mastheadImages.length > 0 ? mastheadImages[0] : ""
    };
    //console.log('New Recipe found and saved!');
    recipeCount++;
    //console.log(recipeCount + ' Recipes Retrieved');
    return data;
  } catch (error) {
    //console.error(`Error scraping data from ${url}:`, error);
    return null;
  }
};
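
// Example usage (not in the original paste): scrapeData can be tried on a single
// recipe page before running the full crawl. The recipe path below is only a
// placeholder; the function resolves to a data object, or null if any of the
// five conditions above fails.
// (async () => {
//   const sample = await scrapeData('https://www.bbcgoodfood.com/recipes/example-recipe');
//   console.log(sample);
// })();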

// Function to recursively crawl URLs and scrape data
const crawlAndScrape = async (url) => {
  try {
    if (!isBBCGoodFoodURL(url) || scrapedURLs.has(url)) {
      return;
    }

    scrapedURLs.add(url);

    // Log the current URL being explored
    console.log(`Exploring URL: ${url}`);

    const data = await scrapeData(url);
    if (data) {
      fs.appendFileSync(outputLocation, JSON.stringify(data, null, 2) + ',\n', 'utf8');
    }

    const response = await fetch(url);
    if (!response.ok) {
      throw new Error('Network response was not ok');
    }
    const html = await response.text();
    const $ = cheerio.load(html);

    // Collect the on-site links, then crawl them one at a time; awaiting each
    // child call keeps the recursion from launching an unbounded number of
    // concurrent requests.
    const childURLs = $('a[href^="https://www.bbcgoodfood.com"]')
      .map((index, element) => $(element).attr('href'))
      .get();
    for (const childURL of childURLs) {
      await crawlAndScrape(childURL);
    }
  } catch (error) {
    console.error(`Error crawling and scraping ${url}:`, error);
  }
};

// Function to stop crawling after 2 minutes
const stopCrawling = () => {
  console.log('Crawling stopped after 2 minutes.');
  process.exit(0);
};

// Start crawling and scraping from the base URL
crawlAndScrape('https://www.bbcgoodfood.com');

// Set timeout to stop crawling after 2 minutes
startTime = Date.now();
setTimeout(stopCrawling, 120000); // 2 minutes = 120,000 milliseconds
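
Because each recipe is appended with a trailing ",\n", the output file ends up as a series of JSON fragments rather than one valid JSON document. A small helper along these lines (hypothetical, not part of the paste) could be called from stopCrawling before process.exit to wrap the fragments into a proper array.

// Hypothetical post-processing step, assuming the outputLocation path sketched above:
// drops the trailing comma and wraps the appended objects in brackets so the file parses as JSON.
const finaliseOutput = () => {
  if (!fs.existsSync(outputLocation)) return; // nothing was scraped
  const raw = fs.readFileSync(outputLocation, 'utf8').trim();
  const wrapped = '[\n' + raw.replace(/,\s*$/, '') + '\n]';
  fs.writeFileSync(outputLocation, wrapped, 'utf8');
};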