Advertisement
Guest User

PasteScraper

a guest
Dec 22nd, 2014
155
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 4.98 KB | None | 0 0
  1. /* Do not forget to add a reference to System.Net.Http */
  2.  
  3. using System;
  4. using System.Collections.Concurrent;
  5. using System.Collections.Generic;
  6. using System.Linq;
  7. using System.Net;
  8. using System.Net.Http;
  9. using System.Text.RegularExpressions;
  10. using System.Threading;
  11. using System.Threading.Tasks;
  12.  
  13. namespace PasteScraper.Pastebin
  14. {
  15.     /// <summary>
  16.     /// Utility class for scraping the most recently posted pastes
  17.     /// </summary>
  18.     public class Scraper
  19.     {
  20.         /// <summary>
  21.         /// Archive URL to get the most recently posted pastes from
  22.         /// </summary>
  23.         private const string ArchiveUrl = "http://pastebin.com/archive";
  24.  
  25.         /// <summary>
  26.         /// Sets the default web proxy for HTTP requests to null, so
  27.         /// the system will not try to resolve it (which causes a delay)
  28.         /// </summary>
  29.         static Scraper()
  30.         {
  31.             WebRequest.DefaultWebProxy = null;
  32.         }
  33.  
  34.         /// <summary>
  35.         /// Scrapse the most recently posted pastes. Method returns when all
  36.         /// pastes have been downloaded, or the timeout elapsed
  37.         /// </summary>
  38.         /// <param name="timeout">Timeout for downloading pastes in seconds</param>
  39.         /// <returns>Most recently posted pastes</returns>
  40.         public static async Task<IEnumerable<Paste>> Scrape(int timeout)
  41.         {
  42.             const string urlPattern = ".*?(\"\").*?((?:\\/[\\w\\.\\-]+)+)";
  43.             const int relativeUrlLength = 9;
  44.  
  45.             string pageSource;
  46.  
  47.             using (var client = new HttpClient()) {
  48.                 pageSource = await client.GetStringAsync(new Uri(ArchiveUrl, UriKind.Absolute));
  49.             }
  50.  
  51.             var urls = (from matches in Regex.Matches(pageSource, urlPattern).Cast<Match>()
  52.                         let url = matches.Groups[2].Value
  53.                         where url.Length == relativeUrlLength
  54.                         select url).ToList();
  55.  
  56.             var pastes = new ConcurrentBag<Paste>();
  57.  
  58.             urls.ToList().ForEach(url
  59.                 => ThreadPool.QueueUserWorkItem(async state
  60.                     => pastes.Add(await Paste.ParseFromUrlAsync(url))));
  61.  
  62.             var count = 0;
  63.  
  64.             while (pastes.Count != urls.Count) {
  65.                 await Task.Delay(TimeSpan.FromSeconds(1));
  66.  
  67.                 if (++count == (timeout > 0 ? timeout : 10)) {
  68.                     break;
  69.                 }
  70.             }
  71.  
  72.             return pastes;
  73.         }
  74.     }
  75.  
  76.     /// <summary>
  77.     /// Contains information regarding a parsed paste
  78.     /// </summary>
  79.     public class Paste
  80.     {
  81.         /// <summary>
  82.         /// The paste's content
  83.         /// </summary>
  84.         public string Content { get; private set; }
  85.  
  86.         /// <summary>
  87.         /// The paste's ID
  88.         /// </summary>
  89.         public string Id { get; private set; }
  90.  
  91.         /// <summary>
  92.         /// Creates a new Paste object and initializes the content
  93.         /// </summary>
  94.         /// <param name="content">The paste's content</param>
  95.         /// <param name="id">The paste's ID</param>
  96.         public Paste(string content, string id)
  97.         {
  98.             Content = content;
  99.             Id = id;
  100.         }
  101.  
  102.         /// <summary>
  103.         /// Parses a new PastebinResult object from a paste's HTML source
  104.         /// </summary>
  105.         /// <param name="relativeUrl">A paste's relative URL ("/id")</param>
  106.         /// <returns>New PastebinResult object from a paste's HTML source</returns>
  107.         public static Task<Paste> ParseFromUrlAsync(string relativeUrl)
  108.         {
  109.             const int relativeUrlLength = 9;
  110.  
  111.             //Eagerly validate argument and throw exception if invalid
  112.             if (!relativeUrl.StartsWith("/") || relativeUrl.Length != relativeUrlLength) {
  113.                 throw new ArgumentException("The relative URL passed is invalid.");
  114.             }
  115.  
  116.             //Skip the forward slash to get the ID
  117.             return ParseFromUrlAsyncImpl(relativeUrl.Substring(1));
  118.         }
  119.  
  120.         /// <summary>
  121.         /// Parses a new PastebinResult object from a paste's HTML source
  122.         /// </summary>
  123.         /// <param name="id">A paste's ID</param>
  124.         /// <returns>New PastebinResult object from a paste's HTML source</returns>
  125.         private static async Task<Paste> ParseFromUrlAsyncImpl(string id)
  126.         {
  127.             const string baseUrl = "http://pastebin.com/raw.php?i=";
  128.             string pageSource;
  129.  
  130.             using (var client = new HttpClient()) {
  131.                 pageSource = await client.GetStringAsync(baseUrl + id);
  132.             }
  133.  
  134.             return new Paste(pageSource, id);
  135.         }
  136.  
  137.         /// <summary>
  138.         /// Formats the pastebin result to display the content
  139.         /// </summary>
  140.         /// <returns>Formatted string containing the pastebin
  141.         /// result to display the content</returns>
  142.         public override string ToString()
  143.         {
  144.             return string.Format("{0}:\n{1}", Id, Content);
  145.         }
  146.     }
  147. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement