Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/// <summary>
/// The default parser for episode listing pages such as:
/// https://www.npr.org/podcasts/510208/car-talk/partials?start=1
/// </summary>
public class EpisodeDataParser : IEpisodeDataParser
{
    #region Regular expressions

    /// <summary>
    /// Extracts the episode number and the title from the newer title format,
    /// for example: '#1825: Male Moron Syndrome' or 'Car Talk #1029: The Pep's Lincoln'.
    /// Group 1 is the optional 'Car Talk ' prefix, group 2 is the episode number
    /// and group 3 is the episode title.
    /// </summary>
    private static readonly Regex EpisodeTitleWithNumberAndNameRegex = new Regex("^(Car Talk )?#(\\d+): (.*)$");

    #endregion

    #region IEpisodeDataParser implementation

    /// <summary>
    /// Parses the given html and finds all the episodes that are contained within it.
    /// Every <c>div</c> with the <c>item-info</c> class is treated as an episode candidate.
    /// A candidate that lacks any of the expected pieces (date, numbered title, download
    /// link, teaser) is logged to the console and skipped, so a single malformed entry
    /// does not abort parsing of the whole page.
    /// </summary>
    /// <param name="html">The html file.</param>
    /// <returns>The list of all successfully parsed episodes, in document order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
    public List<EpisodeData> Parse(string html)
    {
        if (html == null)
        {
            throw new ArgumentNullException(nameof(html));
        }

        // Create html document
        var document = new HtmlDocument();
        document.LoadHtml(html);

        return document.DocumentNode
            // Get all inner divs with the item-info class
            .Descendants("div")
            .Where(node => node.HasClass("item-info"))
            // Convert each of them to an episode (null when it cannot be parsed)
            .Select(ParseEpisode)
            // Take only non-nulls (i.e. successfully parsed)
            .Where(episode => episode != null)
            .ToList();
    }

    #endregion

    #region Private helpers

    /// <summary>
    /// Converts a single <c>item-info</c> node to an episode.
    /// </summary>
    /// <param name="episodeNode">The node that wraps one episode.</param>
    /// <returns>The parsed episode, or null when any required piece is missing or malformed.</returns>
    private static EpisodeData ParseEpisode(HtmlNode episodeNode)
    {
        // NOTE(review): text and attribute values are taken as HtmlAgilityPack returns them;
        // presumably HTML entities (e.g. &amp; in links) are not decoded here - confirm.

        #region Parse date

        // The date is the 'datetime' attribute of the first 'time' element
        var dateString = episodeNode
            .Descendants("time").FirstOrDefault()
            ?.Attributes["datetime"]
            ?.Value;

        // TryParseExact instead of ParseExact: an unexpected value skips the node rather than throwing
        DateTime date;
        if (dateString == null ||
            !DateTime.TryParseExact(dateString, "yyyy-MM-dd", CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
        {
            Console.WriteLine($"Cannot parse the episode date: {dateString}");
            return null;
        }

        #endregion

        #region Parse title and number

        // The full title is the inner text of the first h2 element
        var titleNode = episodeNode.Descendants("h2").FirstOrDefault();
        if (titleNode == null)
        {
            Console.WriteLine("Cannot find the title of an episode");
            return null;
        }

        var fullTitle = titleNode.InnerText.Trim();

        // Try to match the title with the new format. The number is parsed with TryParse
        // because \d+ also accepts digit runs that do not fit an int (or are not ASCII digits).
        var newEpisodeMatch = EpisodeTitleWithNumberAndNameRegex.Match(fullTitle);
        int number;
        if (!newEpisodeMatch.Success ||
            !int.TryParse(newEpisodeMatch.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out number))
        {
            // Log the title and assume that this is not a correct episode
            Console.WriteLine($"Cannot parse this title: {fullTitle}");
            return null;
        }

        var title = newEpisodeMatch.Groups[3].Value;

        #endregion

        #region Parse download link

        // The link is the href of the first 'a' element inside the first 'li' element
        var link = episodeNode
            .Descendants("li").FirstOrDefault()
            ?.Descendants("a").FirstOrDefault()
            ?.Attributes["href"]
            ?.Value;

        if (link == null)
        {
            Console.WriteLine($"Cannot find the download link of this episode: {fullTitle}");
            return null;
        }

        #endregion

        #region Parse teaser

        // The teaser is the text of the last child of the first 'p' element
        var teaser = episodeNode
            .Descendants("p").FirstOrDefault()
            ?.LastChild
            ?.InnerText;

        if (teaser == null)
        {
            Console.WriteLine($"Cannot find the teaser of this episode: {fullTitle}");
            return null;
        }

        #endregion

        // Return the episode
        return new EpisodeData
        {
            Date = date,
            Number = number,
            Title = title,
            DownloadLink = link.Trim(),
            Teaser = teaser.Trim()
        };
    }

    #endregion
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement