Advertisement
Guest User

Untitled

a guest
Jul 20th, 2018
64
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 4.51 KB | None | 0 0
  /// <summary>
  /// A default parser that parses sites like: https://www.npr.org/podcasts/510208/car-talk/partials?start=1
  /// </summary>
  public class EpisodeDataParser : IEpisodeDataParser
  {
      #region Regular expressions

      /// <summary>
      /// The regular expression to extract the episode number and the title from the
      /// newer format of title - for example: '#1825: Male Moron Syndrome', or 'Car Talk #1029: The Pep's Lincoln'.
      /// Group 1 is the optional 'Car Talk ' prefix, group 2 is the episode number and group 3 is the title.
      /// </summary>
      private static readonly Regex EpisodeTitleWithNumberAndNameRegex = new Regex("^(Car Talk )?#(\\d+): (.*)$");

      #endregion

      #region Constants

      /// <summary>
      /// The exact format expected in the 'datetime' attribute of an episode's 'time' element.
      /// </summary>
      private const string EpisodeDateFormat = "yyyy-MM-dd";

      #endregion

      #region IEpisodeDataParser implementation

      /// <summary>
      /// Parses the given html and finds all the episodes that are contained within it.
      /// Every 'div' with the 'item-info' class is treated as an episode candidate; candidates
      /// that cannot be parsed are logged to the console and left out of the result instead of
      /// aborting the whole parse.
      /// </summary>
      /// <param name="html">The html file.</param>
      /// <returns>The list of all successfully parsed episodes, in document order.</returns>
      public List<EpisodeData> Parse(string html)
      {
          // Create html document
          var document = new HtmlDocument();
          document.LoadHtml(html);

          // Find all the episodes. Take the document node
          return document.DocumentNode
              // Get all inner divs
              .Descendants("div")
              // With the item-info class
              .Where(node => node.HasClass("item-info"))
              // Convert each element to an episode (null when it cannot be parsed)
              .Select(node => ParseEpisode(node))
              // Take only non-nulls (i.e. successfully parsed)
              .Where(episode => episode != null)
              // And materialize them as a list
              .ToList();
      }

      #endregion

      #region Private helpers

      /// <summary>
      /// Converts a single 'item-info' node to an <see cref="EpisodeData"/>.
      /// </summary>
      /// <param name="episodeNode">The node that wraps a single episode.</param>
      /// <returns>
      /// The parsed episode, or null when the date, the title or the download link is missing or malformed.
      /// A missing teaser is not treated as an error - the teaser is an empty string in that case.
      /// </returns>
      private static EpisodeData ParseEpisode(HtmlNode episodeNode)
      {
          // NOTE(review): inner texts and attribute values are used exactly as HtmlAgilityPack returns them.
          // If html entities (e.g. &amp;) show up in titles, teasers or links they may need decoding - confirm
          // against real pages before changing it.

          #region Parse date

          // From the episode node get the first 'time' element and its datetime attribute value.
          // Either of them may be missing, in which case the date string stays null.
          var dateString = episodeNode
              .Descendants("time").FirstOrDefault()
              ?.Attributes["datetime"]
              ?.Value;

          // Parse the date without throwing, so one broken item does not abort the whole page
          DateTime date;
          if (dateString == null ||
              !DateTime.TryParseExact(dateString, EpisodeDateFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
          {
              Console.WriteLine($"Cannot parse this date: {dateString}");
              return null;
          }

          #endregion

          #region Parse title and number

          // Find the first h2 element, which holds the full title
          var titleNode = episodeNode.Descendants("h2").FirstOrDefault();
          if (titleNode == null)
          {
              Console.WriteLine("Cannot find the title element of an episode");
              return null;
          }

          var fullTitle = titleNode.InnerText.Trim();

          // Try to match the title with the new format
          var newEpisodeMatch = EpisodeTitleWithNumberAndNameRegex.Match(fullTitle);

          // The second group is the episode number. TryParse is used because \d+ also matches
          // digit sequences that do not fit into an int (or are not ASCII digits).
          int parsedNumber;
          if (!newEpisodeMatch.Success ||
              !int.TryParse(newEpisodeMatch.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out parsedNumber))
          {
              // Log the title
              Console.WriteLine($"Cannot parse this title: {fullTitle}");

              // Assume that this is not a correct episode
              return null;
          }

          int? number = parsedNumber;

          // And the third group is the title
          var title = newEpisodeMatch.Groups[3].Value;

          #endregion

          #region Parse download link

          // From the episode node take the first li element, the first a element inside it
          // and its href value. Any of them may be missing.
          var href = episodeNode
              .Descendants("li").FirstOrDefault()
              ?.Descendants("a").FirstOrDefault()
              ?.Attributes["href"]
              ?.Value;

          if (href == null)
          {
              Console.WriteLine($"Cannot find the download link of this episode: {fullTitle}");
              return null;
          }

          var link = href.Trim();

          #endregion

          #region Parse teaser

          // From the episode node take the first p element and the text of its last child.
          // Fall back to an empty teaser when the paragraph is missing or has no children.
          var teaser = episodeNode
              .Descendants("p").FirstOrDefault()
              ?.LastChild
              ?.InnerText.Trim() ?? string.Empty;

          #endregion

          // Return the episode
          return new EpisodeData
          {
              Date = date,
              Number = number,
              Title = title,
              DownloadLink = link,
              Teaser = teaser
          };
      }

      #endregion
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement