Advertisement
Quazaka

Untitled

Sep 16th, 2016
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C# 1.61 KB | None | 0 0
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Text;
  5. using System.IO;
  6. using System.Net;
  7. using System.Text.RegularExpressions;
  8.  
  namespace ParsingRobotTxt
  {
      /// <summary>
      /// Console utility that downloads a site's robots.txt and prints the URLs
      /// that crawlers matching <c>User-agent: *</c> are asked not to visit.
      /// </summary>
      class Program
      {
          /// <summary>Request created by the most recent <see cref="isValidUrl"/> call.</summary>
          public static HttpWebRequest req;

          /// <summary>Response obtained by the most recent <see cref="isValidUrl"/> call.</summary>
          public static HttpWebResponse res;

          // Body stream of `res`; set by urlOpen() and consumed (then cleared) by read().
          static Stream resStream;

          static void Main(string[] args)
          {
              // Pass the site root; getDisallowedUrls resolves /robots.txt itself, which
              // avoids the "//robots.txt" double slash produced by string concatenation.
              String baseUrl = "http://www.cnn.com/";

              getDisallowedUrls(baseUrl);
          }

          /// <summary>
          /// Fetches robots.txt for <paramref name="baseUrl"/> and writes every path
          /// disallowed for all user agents to the console as an absolute URL, then
          /// waits for Enter so the output stays visible.
          /// </summary>
          /// <param name="baseUrl">
          /// Either the site root (e.g. <c>http://example.com/</c>) or the full URL of
          /// a robots.txt file.
          /// </param>
          static void getDisallowedUrls(string baseUrl)
          {
              string robotsUrl = toRobotsUrl(baseUrl);

              if (isValidUrl(robotsUrl))
              {
                  urlOpen();
                  String robotTxtContent = read();

                  // Disallow values are paths relative to the site root, not to the
                  // robots.txt URL, so prefix them with scheme + host only.
                  string siteRoot = new Uri(robotsUrl).GetLeftPart(UriPartial.Authority);

                  // List that holds Urls which shouldn't be crawled
                  List<string> disallowed = parseDisallowedPaths(robotTxtContent);
                  foreach (string path in disallowed)
                  {
                      string separator = path.StartsWith("/", StringComparison.Ordinal) ? "" : "/";
                      Console.WriteLine(siteRoot + separator + path);
                  }
              }
              else
              {
                  // Nothing to read; release a non-OK response if one was received.
                  closeResponse();
              }

              Console.ReadLine();
          }

          /// <summary>
          /// Extracts the non-empty <c>Disallow</c> values from every group of a
          /// robots.txt document that applies to <c>User-agent: *</c>.
          /// </summary>
          /// <param name="robotsTxtContent">Raw robots.txt text; may be null or empty.</param>
          /// <returns>
          /// Disallowed paths in document order; an empty list when there are none.
          /// </returns>
          public static List<string> parseDisallowedPaths(string robotsTxtContent)
          {
              List<string> disallowed = new List<string>();
              if (String.IsNullOrEmpty(robotsTxtContent))
              {
                  return disallowed;
              }

              bool groupAppliesToAll = false;
              bool previousWasUserAgent = false;

              string[] lines = robotsTxtContent.Split(
                  new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

              foreach (string rawLine in lines)
              {
                  // Everything after '#' is a comment.
                  string line = rawLine;
                  int commentStart = line.IndexOf('#');
                  if (commentStart >= 0)
                  {
                      line = line.Substring(0, commentStart);
                  }

                  line = line.Trim();
                  int colon = line.IndexOf(':');
                  if (colon <= 0)
                  {
                      continue; // blank, comment-only or malformed line
                  }

                  string field = line.Substring(0, colon).Trim();
                  string value = line.Substring(colon + 1).Trim();

                  if (field.Equals("User-agent", StringComparison.OrdinalIgnoreCase))
                  {
                      // Consecutive User-agent lines share one group; the first
                      // User-agent line after any other rule starts a new group.
                      if (!previousWasUserAgent)
                      {
                          groupAppliesToAll = false;
                      }

                      if (value == "*")
                      {
                          groupAppliesToAll = true;
                      }

                      previousWasUserAgent = true;
                      continue;
                  }

                  previousWasUserAgent = false;

                  // An empty Disallow value means "nothing is disallowed", so skip it.
                  if (groupAppliesToAll
                      && value.Length > 0
                      && field.Equals("Disallow", StringComparison.OrdinalIgnoreCase))
                  {
                      disallowed.Add(value);
                  }
              }

              return disallowed;
          }

          /// <summary>
          /// Reads the remaining response body opened by <see cref="urlOpen"/> and
          /// releases the stream and the underlying response.
          /// </summary>
          /// <returns>The body text, or an empty string when no stream is open.</returns>
          public static String read()
          {
              if (resStream == null)
              {
                  return String.Empty;
              }

              try
              {
                  // Disposing the reader also closes the response stream.
                  using (StreamReader sr = new StreamReader(resStream))
                  {
                      return sr.ReadToEnd();
                  }
              }
              finally
              {
                  resStream = null;
                  closeResponse();
              }
          }

          /// <summary>
          /// Opens the body stream of the response obtained by <see cref="isValidUrl"/>.
          /// </summary>
          /// <exception cref="InvalidOperationException">
          /// No response is available because <see cref="isValidUrl"/> has not succeeded.
          /// </exception>
          public static void urlOpen()
          {
              if (res == null)
              {
                  throw new InvalidOperationException(
                      "No response available; call isValidUrl with a reachable URL first.");
              }

              resStream = res.GetResponseStream();
          }

          /// <summary>
          /// Issues an HTTP request for <paramref name="url"/> and stores the request
          /// and response in <see cref="req"/> and <see cref="res"/>.
          /// </summary>
          /// <param name="url">Absolute http/https URL to request.</param>
          /// <returns>
          /// True when the server answered 200 OK; false otherwise, including when the
          /// URL is malformed or unreachable (a message is written to the console).
          /// </returns>
          public static bool isValidUrl(String url)
          {
              // Release whatever a previous call left open before overwriting it.
              resStream = null;
              closeResponse();

              if (String.IsNullOrEmpty(url))
              {
                  Console.WriteLine("Not a Valid URL:" + "URL is empty" + " - " + url);
                  return false;
              }

              try
              {
                  req = WebRequest.Create(url) as HttpWebRequest;
                  if (req == null)
                  {
                      // WebRequest.Create succeeded but for a non-HTTP scheme (ftp:, file:, ...).
                      Console.WriteLine("Not a Valid URL:" + "Only http/https is supported" + " - " + url);
                      return false;
                  }

                  res = (HttpWebResponse)req.GetResponse();
                  return (res.StatusCode == HttpStatusCode.OK);
              }
              catch (UriFormatException ex)
              {
                  return reportInvalidUrl(url, ex);
              }
              catch (NotSupportedException ex)
              {
                  return reportInvalidUrl(url, ex);
              }
              catch (WebException ex)
              {
                  return reportInvalidUrl(url, ex);
              }
          }

          // Maps a site root (or any page URL) to that site's /robots.txt; a URL that
          // already names a robots.txt file, or that cannot be parsed, is returned as-is
          // so isValidUrl can report the problem.
          static string toRobotsUrl(string baseUrl)
          {
              Uri parsed;
              if (!Uri.TryCreate(baseUrl, UriKind.Absolute, out parsed))
              {
                  return baseUrl;
              }

              if (parsed.AbsolutePath.EndsWith("robots.txt", StringComparison.OrdinalIgnoreCase))
              {
                  return parsed.AbsoluteUri;
              }

              return new Uri(parsed, "/robots.txt").AbsoluteUri;
          }

          // Closes the current response, if any; Close() is safe to call repeatedly.
          static void closeResponse()
          {
              if (res != null)
              {
                  res.Close();
              }
          }

          // Writes the standard failure message and returns false for use in catch blocks.
          static bool reportInvalidUrl(String url, Exception ex)
          {
              Console.WriteLine("Not a Valid URL:" + ex.Message + " - " + url);
              return false;
          }
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement