Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Sockets;
using System.Text;
using System.Threading.Tasks;
- namespace Scraper
- {
/// <summary>
/// URL schemes the scraper can tell apart. Stored as a single byte.
/// </summary>
public enum Protocol : byte
{
    /// <summary>No registered protocol prefix matched (also the default value).</summary>
    UNKNOWN = 0,

    /// <summary>Hypertext Transfer Protocol.</summary>
    HTTP = 1,

    /// <summary>HTTP over TLS.</summary>
    HTTPS = 2,

    /// <summary>File Transfer Protocol.</summary>
    FTP = 3,

    /// <summary>Local file reference.</summary>
    FILE = 4,

    /// <summary>SSH File Transfer Protocol.</summary>
    SFTP = 5
}
/// <summary>
/// A URL broken into its parts. Fields are left at their defaults
/// (<c>null</c> / <c>0</c> / <c>false</c>) when the corresponding part is absent.
/// </summary>
public struct Url
{
    /// <summary>Scheme of the URL, or <see cref="Protocol.UNKNOWN"/> when none was recognised.</summary>
    public Protocol protocol;

    /// <summary>Host portion of the URL, without scheme or path.</summary>
    public string domain;

    /// <summary>Path portion of the URL, starting at the first '/'; <c>null</c> when there is none.</summary>
    public string path;

    /// <summary>Port number associated with the URL.</summary>
    public ushort port;

    /// <summary>Whether the URL was parsed successfully and is considered usable.</summary>
    public bool valid;
}
/// <summary>
/// Associates a <see cref="Protocol"/> with the textual prefix that identifies it
/// at the start of a URL.
/// </summary>
public struct ProtocolStringMatch
{
    /// <summary>The protocol this entry identifies.</summary>
    public Protocol protocol;

    /// <summary>The prefix a URL must start with to be classified as <see cref="protocol"/>.</summary>
    public string protocol_string;
}
/// <summary>
/// Registry of known protocol prefixes, blacklisted domains and collected URLs,
/// plus helpers for turning scraped link text into <see cref="Url"/> values.
/// </summary>
/// <remarks>
/// Declared <c>static</c> because every member is static and C# only permits the
/// <see cref="IsValidDomain(string)"/> extension method inside a static class.
/// The shared lists are not synchronized.
/// </remarks>
static class UrlHandler
{
    // Registered (protocol, prefix) pairs, searched in insertion order.
    private static readonly List<ProtocolStringMatch> protocols = new List<ProtocolStringMatch>();

    // Accepted URLs. The list grows on demand: pre-sizing it to Int32.MaxValue asks for
    // an array larger than the runtime allows and makes the type initializer throw.
    private static readonly List<Url> urls = new List<Url>();

    // Domains whose URLs are dropped by PushUrl.
    private static readonly List<string> domainBlacklist = new List<string>();

    /// <summary>Registers a prefix that identifies <paramref name="protocol"/>.</summary>
    /// <param name="protocol">Protocol the prefix stands for.</param>
    /// <param name="match">Prefix a URL must start with.</param>
    public static void PushProtocol(Protocol protocol, string match)
    {
        protocols.Add(new ProtocolStringMatch()
        {
            protocol = protocol,
            protocol_string = match
        });
    }

    /// <summary>Classifies <paramref name="url"/> by its registered prefix.</summary>
    /// <param name="url">Text to inspect; may be <c>null</c> or empty.</param>
    /// <returns>
    /// The protocol of the first registered prefix that <paramref name="url"/> starts with
    /// (compared case-insensitively), or <see cref="Protocol.UNKNOWN"/> when none matches.
    /// </returns>
    public static Protocol GetUrlProtocol(string url)
    {
        ProtocolStringMatch match;
        return TryMatchProtocol(url, out match) ? match.protocol : Protocol.UNKNOWN;
    }

    /// <summary>Looks up the prefix registered for <paramref name="protocol"/>.</summary>
    /// <returns>The first prefix registered for the protocol, or <c>null</c> when there is none.</returns>
    public static string GetProtocolMatch(Protocol protocol)
    {
        foreach (ProtocolStringMatch psm in protocols)
        {
            if (protocol == psm.protocol)
            {
                return psm.protocol_string;
            }
        }
        return null;
    }

    /// <summary>Adds <paramref name="domain"/> to the blacklist consulted by <see cref="PushUrl"/>.</summary>
    public static void PushBlacklistDomain(string domain)
    {
        domainBlacklist.Add(domain);
    }

    /// <summary>
    /// Stores <paramref name="url"/> unless its domain is blacklisted.
    /// Host names are case-insensitive, so the blacklist comparison ignores case.
    /// </summary>
    public static void PushUrl(Url url)
    {
        foreach (string domain in domainBlacklist)
        {
            if (string.Equals(url.domain, domain, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }
        }
        urls.Add(url);
    }

    /// <summary>
    /// Builds a <see cref="Url"/> from a link found while connected to
    /// <paramref name="currentConnection"/>.
    /// </summary>
    /// <param name="currentConnection">URL of the page the link was found on; used to resolve root-relative links.</param>
    /// <param name="new_path">The link text: a root-relative path, an absolute URL, or a bare "domain/path".</param>
    /// <returns>
    /// The parsed URL. <c>valid</c> is <c>false</c> when the link is empty, is a query-only
    /// ("?...") or fragment-only ("#...") reference, cannot be resolved against
    /// <paramref name="currentConnection"/>, or its domain does not resolve via DNS.
    /// </returns>
    public static Url ParseUrl(string currentConnection, string new_path)
    {
        Url ret = default(Url);
        // Let's just stick with 80, we can get ports from the url later.
        ret.port = 80;

        // Nothing to parse; also guards the new_path[0] access below.
        if (string.IsNullOrEmpty(new_path))
        {
            return ret;
        }

        char first = new_path[0];

        // "?ajax=1&download=all" - TODO: parse query redirect.
        // "#lol" - only useful with human interaction, so skip it.
        // Return early so no DNS lookup is attempted on a missing domain.
        if (first == '?' || first == '#')
        {
            return ret;
        }

        ProtocolStringMatch match;
        if (first == '/')
        {
            // Root-relative path like "/path/index.php?lol": the protocol and domain
            // come from the current connection, the path is the link itself.
            if (!TryMatchProtocol(currentConnection, out match) || match.protocol == Protocol.UNKNOWN)
            {
                return ret;
            }
            ret.protocol = match.protocol;
            SplitDomainAndPath(currentConnection.Substring(match.protocol_string.Length), ref ret);
            // Always take the new path, even when the current connection had no path of its own.
            ret.path = new_path;
        }
        else if (TryMatchProtocol(new_path, out match) && match.protocol != Protocol.UNKNOWN)
        {
            // Absolute URL like "http://google.com/drinkbleach?dankmeme":
            // everything is taken from the link, not from the current connection.
            ret.protocol = match.protocol;
            SplitDomainAndPath(new_path.Substring(match.protocol_string.Length), ref ret);
        }
        else
        {
            // Bare domain like "google.com/blahclhfgdfg?fghghjhj".
            SplitDomainAndPath(new_path, ref ret);
        }

        // The URL is only valid when its domain actually resolves.
        ret.valid = IsValidDomain(ret.domain);
        return ret;
    }

    /// <summary>Checks whether <paramref name="domainName"/> resolves to at least one IP address.</summary>
    /// <param name="domainName">Host name to resolve; <c>null</c>, empty and whitespace are treated as invalid.</param>
    /// <returns><c>true</c> when DNS resolution yields one or more addresses; otherwise <c>false</c>.</returns>
    public static bool IsValidDomain(this string domainName)
    {
        // Dns.GetHostAddresses throws on null and resolves "" to the local host,
        // neither of which describes a valid remote domain.
        if (string.IsNullOrWhiteSpace(domainName))
        {
            return false;
        }

        try
        {
            return Dns.GetHostAddresses(domainName).Length > 0;
        }
        catch (SocketException)
        {
            // The host could not be resolved.
            return false;
        }
        catch (ArgumentException)
        {
            // The name is too long or is a malformed address.
            return false;
        }
    }

    // Finds the first registered entry whose prefix starts `url`, so callers strip exactly the
    // prefix that matched. Schemes are case-insensitive, hence OrdinalIgnoreCase. Null or empty
    // prefixes are skipped: null would throw, and an empty prefix would match every input.
    private static bool TryMatchProtocol(string url, out ProtocolStringMatch match)
    {
        match = default(ProtocolStringMatch);
        if (string.IsNullOrEmpty(url))
        {
            return false;
        }

        foreach (ProtocolStringMatch psm in protocols)
        {
            if (!string.IsNullOrEmpty(psm.protocol_string) &&
                url.StartsWith(psm.protocol_string, StringComparison.OrdinalIgnoreCase))
            {
                match = psm;
                return true;
            }
        }
        return false;
    }

    // Splits "domain/path..." at the first '/': the domain goes before it, the path starts at it.
    // When there is no '/', the whole text is the domain and the path is left untouched.
    private static void SplitDomainAndPath(string remainder, ref Url url)
    {
        int pathPosition = remainder.IndexOf('/');
        if (pathPosition == -1)
        {
            url.domain = remainder;
            return;
        }

        url.domain = remainder.Substring(0, pathPosition);
        url.path = remainder.Substring(pathPosition);
    }
}
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement