using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

namespace ParsingRobotTxt
{
    class Program
    {
        public static HttpWebRequest req;
        public static HttpWebResponse res;
        static Stream resStream;

        static void Main(string[] args)
        {
            String baseUrl = "http://www.cnn.com";
            String robotsUrl = baseUrl + "/robots.txt";
            getDisallowedUrls(baseUrl, robotsUrl);
        }

        static void getDisallowedUrls(string baseUrl, string robotsUrl)
        {
            // Bail out early if robots.txt cannot be fetched; otherwise
            // read() would dereference a null response stream.
            if (!isValidUrl(robotsUrl))
                return;

            urlOpen();
            String robotsTxtContent = read();

            // Holds the path prefixes that shouldn't be crawled.
            List<String> disallowed = new List<String>();

            // Split the file into per-agent sections and keep the one that
            // applies to all crawlers ("User-agent: *").
            String[] userAgentSections = Regex.Split(robotsTxtContent, "User-agent:");
            String wildcardSection = "";
            foreach (String section in userAgentSections)
            {
                if (section.Trim().StartsWith("*"))
                {
                    wildcardSection = section.Trim().Substring(1);
                }
            }

            // Each "Disallow:" entry is a path prefix relative to the site root.
            String[] disallow = Regex.Split(wildcardSection, "Disallow:");
            foreach (String item in disallow)
            {
                String path = item.Trim();
                if (path.Length > 0)
                {
                    disallowed.Add(path);
                    Console.WriteLine(baseUrl + path);
                }
            }
            Console.ReadLine();
        }

        public static String read()
        {
            StreamReader sr = new StreamReader(resStream);
            return sr.ReadToEnd();
        }

        public static void urlOpen()
        {
            resStream = res.GetResponseStream();
        }

        public static bool isValidUrl(String url)
        {
            try
            {
                req = (HttpWebRequest)WebRequest.Create(url);
                res = (HttpWebResponse)req.GetResponse();
                return (res.StatusCode == HttpStatusCode.OK);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Not a valid URL: " + ex.Message + " - " + url);
                return false;
            }
        }
    }
}
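
For illustration only, suppose the fetched robots.txt contained these hypothetical rules (cnn.com's real file will differ):

User-agent: *
Disallow: /search
Disallow: /private/

The Regex.Split on "User-agent:" isolates the wildcard section, the second split on "Disallow:" yields the path prefixes, and the program would print each prefix appended to the site root:

http://www.cnn.com/search
http://www.cnn.com/private/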