Advertisement
LegionMammal978

dictionary.com scraper

Jan 16th, 2016
25
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 3.09 KB | None | 0 0
  1. using HtmlAgilityPack;
  2. using System;
  3. using System.Collections.Generic;
  4. using System.IO;
  5. using System.Net;
  6. using System.Text.RegularExpressions;
  7.  
  /// <summary>
  /// Console scraper that walks the alphabetical word index at
  /// http://dictionary.reference.com/list/{letter}/{page}, follows each entry's
  /// definition link, and writes every lower-case word it finds, sorted and
  /// de-duplicated, to all_words.txt in the working directory.
  /// </summary>
  class Program
  {
      /// <summary>
      /// Entry point: scrapes all letters, then writes the sorted word set to
      /// all_words.txt, echoing each word to the console.
      /// </summary>
      static void Main()
      {
          // Lookarounds keep the delimiters out of the match. The earlier pattern
          // consumed the trailing space, so the next word could no longer match its
          // required leading space and every other word in a sentence was skipped.
          Regex r0 = new Regex("(?<= |^)(\\p{Ll}+)(?=[ .,?!]|$)", RegexOptions.Compiled);

          // Finish the whole crawl before touching the output file, so a failure
          // mid-crawl does not truncate a previously written all_words.txt.
          SortedSet<string> words = new SortedSet<string>(F0(r0));

          using (FileStream fs0 = File.Open("all_words.txt", FileMode.Create))
          using (StreamWriter sw0 = new StreamWriter(fs0))
          {
              foreach (string s0 in words)
              {
                  Console.WriteLine("Found {0}...", s0);
                  sw0.WriteLine(s0);
              }
          }
          Console.WriteLine("Finished!");
      }

      /// <summary>
      /// Lazily yields the words found under every letter index, 'a' through 'z'.
      /// </summary>
      /// <param name="r0">Word-matching regex; group 1 is the word itself.</param>
      /// <returns>All words found, in crawl order, possibly with duplicates.</returns>
      static IEnumerable<string> F0(Regex r0)
      {
          Console.WriteLine("Scanning /list...");
          for (char c0 = 'a'; c0 <= 'z'; c0++)
          {
              foreach (string s0 in F1(c0, r0))
              {
                  yield return s0;
              }
          }
      }

      /// <summary>
      /// Lazily yields the words found on every list page of one letter, starting at
      /// page 1 and stopping at the first page that <see cref="F2"/> rejects.
      /// </summary>
      /// <param name="c0">Letter whose index is scanned.</param>
      /// <param name="r0">Word-matching regex; group 1 is the word itself.</param>
      /// <returns>Words from all existing pages of the letter.</returns>
      static IEnumerable<string> F1(char c0, Regex r0)
      {
          Console.WriteLine("Scanning /list/{0}...", c0);
          for (int i0 = 1; ; i0++)
          {
              IEnumerable<string> ieSsE0;
              try
              {
                  // F2 runs eagerly, so any failure is raised here, inside the try.
                  ieSsE0 = F2(c0, i0, r0);
              }
              catch (WebException ex)
              {
                  // Must precede the InvalidOperationException handler because
                  // WebException derives from it. A failed request ends this letter.
                  Console.Error.WriteLine("Stopping /list/{0} at page {1}: {2}", c0, i0, ex.Message);
                  yield break;
              }
              catch (InvalidOperationException)
              {
                  // F2's signal that the page does not exist: normal end of the letter.
                  yield break;
              }

              foreach (string s0 in ieSsE0)
              {
                  yield return s0;
              }
          }
      }

      /// <summary>
      /// Downloads one list page and returns the words found on it and on each
      /// entry's definition page. Runs eagerly (it is not an iterator) so that
      /// callers can catch its exceptions around the call itself.
      /// </summary>
      /// <param name="c0">Letter of the index.</param>
      /// <param name="i0">1-based page number within the letter.</param>
      /// <param name="r0">Word-matching regex; group 1 is the word itself.</param>
      /// <returns>Words collected from the page and its linked definition pages.</returns>
      /// <exception cref="InvalidOperationException">
      /// The server answered with 301 Moved Permanently, or the page contains no word
      /// entries; both are treated as "no such page".
      /// </exception>
      /// <exception cref="WebException">The list page request failed.</exception>
      static IEnumerable<string> F2(char c0, int i0, Regex r0)
      {
          Console.WriteLine("Scanning /list/{0}/{1}...", c0, i0);
          HttpWebRequest hwr0 = WebRequest.CreateHttp("http://dictionary.reference.com/list/" + c0 + '/' + i0);
          // Redirects are not followed so that a 301 can be observed below.
          hwr0.AllowAutoRedirect = false;

          HtmlDocument hd0 = new HtmlDocument();
          using (HttpWebResponse hwr1 = (HttpWebResponse)hwr0.GetResponse())
          {
              // NOTE(review): a 301 is taken to mean the page number is past the end
              // of the letter's list - confirm against the site's actual behaviour.
              if (hwr1.StatusCode == HttpStatusCode.MovedPermanently)
              {
                  throw new InvalidOperationException("List page /list/" + c0 + '/' + i0 + " does not exist.");
              }
              using (Stream s0 = hwr1.GetResponseStream())
              {
                  hd0.Load(s0);
              }
          }

          // SelectNodes returns null, not an empty collection, when nothing matches.
          HtmlNodeCollection entries = hd0.DocumentNode.SelectNodes("/html/body/div[@class='content-container']/div[@class='words-list']/ul/li/span[@class='word']");
          if (entries == null)
          {
              // An entry-less page is treated as the end of the list; otherwise the
              // caller's unbounded page loop would never terminate.
              throw new InvalidOperationException("List page /list/" + c0 + '/' + i0 + " has no word entries.");
          }

          List<string> found = new List<string>();
          foreach (HtmlNode hn0 in entries)
          {
              if (r0.IsMatch(hn0.InnerText))
              {
                  found.Add(hn0.InnerText);
              }

              HtmlNode link = hn0.ParentNode.SelectSingleNode("span[@class='definition-link']/a");
              HtmlAttribute href = link != null ? link.Attributes["href"] : null;
              if (href == null || string.IsNullOrEmpty(href.Value))
              {
                  continue; // Entry without a definition link: nothing more to follow.
              }

              try
              {
                  found.AddRange(F3(href.Value, r0));
              }
              catch (WebException ex)
              {
                  // One unreachable definition page should not discard the whole letter.
                  Console.Error.WriteLine("Skipping {0}: {1}", href.Value, ex.Message);
              }
              catch (UriFormatException ex)
              {
                  Console.Error.WriteLine("Skipping {0}: {1}", href.Value, ex.Message);
              }
          }
          return found;
      }

      /// <summary>
      /// Downloads a definition page and yields every word matched by
      /// <paramref name="r0"/> inside the text of any span element.
      /// </summary>
      /// <param name="s0">Absolute URL of the definition page.</param>
      /// <param name="r0">Word-matching regex; group 1 is the word itself.</param>
      /// <returns>Matched words, possibly with duplicates (nested spans repeat text).</returns>
      /// <exception cref="UriFormatException"><paramref name="s0"/> is not an absolute URI.</exception>
      /// <exception cref="WebException">The request failed.</exception>
      static IEnumerable<string> F3(string s0, Regex r0)
      {
          Console.WriteLine("Scanning {0}...", new Uri(s0).AbsolutePath);

          HtmlDocument hd0 = new HtmlDocument();
          using (WebResponse wr0 = WebRequest.Create(s0).GetResponse())
          using (Stream s1 = wr0.GetResponseStream())
          {
              hd0.Load(s1);
          }

          // SelectNodes returns null when the document has no span elements.
          HtmlNodeCollection spans = hd0.DocumentNode.SelectNodes("//span");
          if (spans == null)
          {
              yield break;
          }

          foreach (HtmlNode hn0 in spans)
          {
              foreach (Match m0 in r0.Matches(hn0.InnerText))
              {
                  // Group 1 is the bare word. Match.Captures[0] is the whole match,
                  // which previously included the surrounding space and punctuation.
                  yield return m0.Groups[1].Value;
              }
          }
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement