Advertisement
Guest User

PasteScraper

a guest
Dec 22nd, 2014
204
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 5.02 KB | None | 0 0
  1. /* Do not forget to add a reference to System.Net.Http */
  2. /* Credits: Dev @ InspireCoding */
  3.  
  4. using System;
  5. using System.Collections.Concurrent;
  6. using System.Collections.Generic;
  7. using System.Linq;
  8. using System.Net;
  9. using System.Net.Http;
  10. using System.Text.RegularExpressions;
  11. using System.Threading;
  12. using System.Threading.Tasks;
  13.  
  14. namespace PasteScraper.Pastebin
  15. {
  16.     /// <summary>
  17.     /// Utility class for scraping the most recently posted pastes
  18.     /// </summary>
  19.     public class Scraper
  20.     {
  21.         /// <summary>
  22.         /// Archive URL to get the most recently posted pastes from
  23.         /// </summary>
  24.         private const string ArchiveUrl = "http://pastebin.com/archive";
  25.  
  26.         /// <summary>
  27.         /// Sets the default web proxy for HTTP requests to null, so
  28.         /// the system will not try to resolve it (which causes a delay)
  29.         /// </summary>
  30.         static Scraper()
  31.         {
  32.             WebRequest.DefaultWebProxy = null;
  33.         }
  34.  
  35.         /// <summary>
  36.         /// Scrapse the most recently posted pastes. Method returns when all
  37.         /// pastes have been downloaded, or the timeout elapsed
  38.         /// </summary>
  39.         /// <param name="timeout">Timeout for downloading pastes in seconds</param>
  40.         /// <returns>Most recently posted pastes</returns>
  41.         public static async Task<IEnumerable<Paste>> Scrape(int timeout)
  42.         {
  43.             const string urlPattern = ".*?(\"\").*?((?:\\/[\\w\\.\\-]+)+)";
  44.             const int relativeUrlLength = 9;
  45.  
  46.             string pageSource;
  47.  
  48.             using (var client = new HttpClient()) {
  49.                 pageSource = await client.GetStringAsync(new Uri(ArchiveUrl, UriKind.Absolute));
  50.             }
  51.  
  52.             var urls = (from matches in Regex.Matches(pageSource, urlPattern).Cast<Match>()
  53.                         let url = matches.Groups[2].Value
  54.                         where url.Length == relativeUrlLength
  55.                         select url).ToList();
  56.  
  57.             var pastes = new ConcurrentBag<Paste>();
  58.  
  59.             urls.ToList().ForEach(url
  60.                 => ThreadPool.QueueUserWorkItem(async state
  61.                     => pastes.Add(await Paste.ParseFromUrlAsync(url))));
  62.  
  63.             var count = 0;
  64.  
  65.             while (pastes.Count != urls.Count) {
  66.                 await Task.Delay(TimeSpan.FromSeconds(1));
  67.  
  68.                 if (++count == (timeout > 0 ? timeout : 10)) {
  69.                     break;
  70.                 }
  71.             }
  72.  
  73.             return pastes;
  74.         }
  75.     }
  76.  
  77.     /// <summary>
  78.     /// Contains information regarding a parsed paste
  79.     /// </summary>
  80.     public class Paste
  81.     {
  82.         /// <summary>
  83.         /// The paste's content
  84.         /// </summary>
  85.         public string Content { get; private set; }
  86.  
  87.         /// <summary>
  88.         /// The paste's ID
  89.         /// </summary>
  90.         public string Id { get; private set; }
  91.  
  92.         /// <summary>
  93.         /// Creates a new Paste object and initializes the content
  94.         /// </summary>
  95.         /// <param name="content">The paste's content</param>
  96.         /// <param name="id">The paste's ID</param>
  97.         public Paste(string content, string id)
  98.         {
  99.             Content = content;
  100.             Id = id;
  101.         }
  102.  
  103.         /// <summary>
  104.         /// Parses a new PastebinResult object from a paste's HTML source
  105.         /// </summary>
  106.         /// <param name="relativeUrl">A paste's relative URL ("/id")</param>
  107.         /// <returns>New PastebinResult object from a paste's HTML source</returns>
  108.         public static Task<Paste> ParseFromUrlAsync(string relativeUrl)
  109.         {
  110.             const int relativeUrlLength = 9;
  111.  
  112.             //Eagerly validate argument and throw exception if invalid
  113.             if (!relativeUrl.StartsWith("/") || relativeUrl.Length != relativeUrlLength) {
  114.                 throw new ArgumentException("The relative URL passed is invalid.");
  115.             }
  116.  
  117.             //Skip the forward slash to get the ID
  118.             return ParseFromUrlAsyncImpl(relativeUrl.Substring(1));
  119.         }
  120.  
  121.         /// <summary>
  122.         /// Parses a new PastebinResult object from a paste's HTML source
  123.         /// </summary>
  124.         /// <param name="id">A paste's ID</param>
  125.         /// <returns>New PastebinResult object from a paste's HTML source</returns>
  126.         private static async Task<Paste> ParseFromUrlAsyncImpl(string id)
  127.         {
  128.             const string baseUrl = "http://pastebin.com/raw.php?i=";
  129.             string pageSource;
  130.  
  131.             using (var client = new HttpClient()) {
  132.                 pageSource = await client.GetStringAsync(baseUrl + id);
  133.             }
  134.  
  135.             return new Paste(pageSource, id);
  136.         }
  137.  
  138.         /// <summary>
  139.         /// Formats the pastebin result to display the content
  140.         /// </summary>
  141.         /// <returns>Formatted string containing the pastebin
  142.         /// result to display the content</returns>
  143.         public override string ToString()
  144.         {
  145.             return string.Format("{0}:\n{1}", Id, Content);
  146.         }
  147.     }
  148. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement