Advertisement
dereksir

Untitled

Nov 28th, 2023
172
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 3.03 KB | None | 0 0
  1. using PuppeteerSharp;
  2. using AngleSharp;
  3.  
  4.  
  /// <summary>
  /// Plain data holder for one scraped product: a name and a price, both kept
  /// as raw text. Either value may be <c>null</c> until it is assigned.
  /// </summary>
  public class Product
  {
      // Storage behind the public properties below.
      private string? _name;
      private string? _price;

      /// <summary>Product name as text, or <c>null</c> when not set.</summary>
      public string? Name
      {
          get => _name;
          set => _name = value;
      }

      /// <summary>Product price as text, or <c>null</c> when not set.</summary>
      public string? Price
      {
          get => _price;
          set => _price = value;
      }
  }
  10.  
  /// <summary>
  /// Console scraper: renders an infinite-scroll product page in a headless
  /// browser, parses the resulting HTML and prints every product found.
  /// </summary>
  class Program
  {
      // Page whose product list grows as the viewport is scrolled down.
      private const string TargetUrl = "https://scrapingclub.com/exercise/list_infinite_scroll/";

      // Time to wait after the scroll script has been started. The script
      // itself needs 10 scrolls x 0.5 s = 5 s; the remainder gives the last
      // batches of products time to load.
      private static readonly TimeSpan LoadWait = TimeSpan.FromSeconds(10);

      // Runs inside the page: scrolls to the bottom every 0.5 s, 10 times,
      // then stops its own timer. It returns immediately; the scrolling
      // continues in the background of the page.
      private const string ScrollScript = @"
          const scrolls = 10
          let scrollCount = 0

          // Scroll down and then wait for 0.5s
          const scrollInterval = setInterval(() => {
            window.scrollTo(0, document.body.scrollHeight)
            scrollCount++
            if (scrollCount === scrolls) {
                clearInterval(scrollInterval)
            }
          }, 500)
      ";

      /// <summary>
      /// Entry point: fetches the rendered page, extracts the products and
      /// writes one line per product to the console.
      /// </summary>
      /// <param name="args">Command-line arguments (not used).</param>
      static async Task Main(string[] args)
      {
          var renderedHtml = await FetchRenderedHtmlAsync();
          var products = await ParseProductsAsync(renderedHtml);

          // Display the scraped data
          foreach (var product in products)
          {
              Console.WriteLine($"Name: {product.Name} | Price: {product.Price}");
          }
      }

      /// <summary>
      /// Downloads a browser build, opens <see cref="TargetUrl"/> in headless
      /// mode, triggers the scroll script and returns the page HTML as it
      /// stands after <see cref="LoadWait"/> has elapsed.
      /// </summary>
      /// <returns>The HTML content of the page after scrolling.</returns>
      private static async Task<string> FetchRenderedHtmlAsync()
      {
          // Download the browser executable
          await new BrowserFetcher().DownloadAsync();

          var launchOptions = new LaunchOptions
          {
              Headless = true, // run browser in headless mode
          };

          // "await using" awaits the asynchronous disposal of the page and
          // the browser, so this method does not return while they are still
          // shutting down.
          await using var browser = await Puppeteer.LaunchAsync(launchOptions);
          await using var page = await browser.NewPageAsync();

          await page.GoToAsync(TargetUrl);

          // Start the in-page scrolling, then give it time to finish and the
          // products time to load. Task.Delay is used for the pause so the
          // wait does not rely on a browser-library helper.
          await page.EvaluateExpressionAsync(ScrollScript);
          await Task.Delay(LoadWait);

          // Get the fully rendered content after JavaScript rendering
          return await page.GetContentAsync();
      }

      /// <summary>
      /// Parses <paramref name="html"/> with AngleSharp and builds one
      /// <see cref="Product"/> per element matching the <c>.post</c> selector.
      /// </summary>
      /// <param name="html">HTML markup to parse.</param>
      /// <returns>
      /// The products in document order; an empty list when nothing matches.
      /// A missing name (<c>h4</c>) or price (<c>h5</c>) becomes an empty string.
      /// </returns>
      private static async Task<List<Product>> ParseProductsAsync(string html)
      {
          var products = new List<Product>();

          var context = BrowsingContext.New(Configuration.Default);
          using var document = await context.OpenAsync(req => req.Content(html));

          foreach (var productElement in document.QuerySelectorAll(".post"))
          {
              var nameElement = productElement.QuerySelector("h4");
              var priceElement = productElement.QuerySelector("h5");

              // Trim so that whitespace from the markup around the text does
              // not end up in the stored values.
              var name = nameElement?.TextContent.Trim() ?? "";
              var price = priceElement?.TextContent.Trim() ?? "";

              products.Add(new Product { Name = name, Price = price });
          }

          return products;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement