Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <mysql.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
- #include <unistd.h>
- #include <errno.h>
- #include <string.h>
- #include <netdb.h>
- #include <sys/types.h>
- #include <netinet/in.h>
- #include <sys/socket.h>
- #include <arpa/inet.h>
- #include <ctype.h>
/* Status codes: returned by main() as the process exit code and by the
 * helper functions below as their result. */
#define SUCCESS 0
#define UNKNOWN_ERROR 1
#define MYSQL_ERROR 2
#define HTTP_ERROR 3
#define MEM_ERROR 4

/*
 * MySQL credentials.
 * NOTE(review): a root password is committed in source. Each value can be
 * overridden at build time (e.g. -DMYSQL_PASS='"..."') so real credentials do
 * not have to live in the file; the defaults below are the original values.
 */
#ifndef MYSQL_USER
#define MYSQL_USER "root"
#endif
#ifndef MYSQL_PASS
#define MYSQL_PASS "fl33021"
#endif
#ifndef MYSQL_HOST
#define MYSQL_HOST "localhost"
#endif
#ifndef MYSQL_DATABASE
#define MYSQL_DATABASE "havesomeunix"
#endif

/* Characters that sql_escape() prefixes with a backslash. */
#define SQL_CHARS_FORBIDDEN "'\""

/* HTTP information */
#define PAGE_SIZE_LIMIT 2048 /* max size, in bytes, of a web page the crawler will download */
- int url_check(MYSQL *, char *);
- int add_urls(MYSQL *, char *);
- int grep(char *, char *);
- char *get_page(char *, long int); /* TODO: write this function */
- char *sql_escape(char *);
- char *html_tag_remove(char *);
- char *next_url(char **);
- char **get_urls(char *);
- void *get_in_addr(struct sockaddr *);
- int main(void) {
- MYSQL *connection;
- MYSQL_RES *results;
- MYSQL_ROW row;
- char *query;
- int query_buffer;
- long int update_time;
- connection = mysql_init(0);
- if (!mysql_real_connect(connection, MYSQL_HOST,
- MYSQL_USER, MYSQL_PASS, MYSQL_DATABASE, 0, 0, 0)) {
- return MYSQL_ERROR;
- }
- while (1) {
- if (mysql_query(connection, "SELECT url FROM urls WHERE last_checked=(SELECT MIN(last_checked) FROM urls)")) { /* that query is very inefficient and needs to be optimized */
- return MYSQL_ERROR;
- }
- results = mysql_store_result(connection);
- row = mysql_fetch_row(results);
- url_check(connection, row[0]);
- update_time = (long int)time(0);
- query_buffer = update_time + (int)strlen(row[0]);
- if (!(query = malloc(sizeof(char) * (200 + query_buffer))) {
- return MEM_ERROR;
- }
- sprintf(query, "UPDATE urls SET last_checked=%l WHERE url='%s'", update_time, row[0]);
- mysql_query(connection, query);
- free(query);
- mysql_free_result(results);
- }
- mysql_close(connection);
- return SUCCESS;
- }
/*
 * SOCKS5 proxy used to reach the target host.
 * NOTE(review): the original code never said which proxy to connect to (it
 * passed the URL string to connect()). 127.0.0.1:9050 is the default Tor SOCKS
 * port and is an assumption — confirm, or override at build time.
 */
#ifndef SOCKS5_PROXY_HOST
#define SOCKS5_PROXY_HOST "127.0.0.1"
#endif
#ifndef SOCKS5_PROXY_PORT
#define SOCKS5_PROXY_PORT 9050
#endif

/* Send exactly len bytes; returns 0 on success, -1 on error. */
static int get_page_send_all(int sockfd, const void *data, size_t len) {
    const char *cursor = data;

    while (len > 0) {
        ssize_t sent = send(sockfd, cursor, len, 0);
        if (sent <= 0) {
            return -1;
        }
        cursor += sent;
        len -= (size_t)sent;
    }
    return 0;
}

/* Receive exactly len bytes; returns 0 on success, -1 on error or early EOF. */
static int get_page_recv_exact(int sockfd, void *data, size_t len) {
    char *cursor = data;

    while (len > 0) {
        ssize_t got = recv(sockfd, cursor, len, 0);
        if (got <= 0) {
            return -1;
        }
        cursor += got;
        len -= (size_t)got;
    }
    return 0;
}

/*
 * Run the SOCKS5 handshake (no authentication) on an already connected proxy
 * socket and ask the proxy to open a TCP connection to host:80.
 * Returns 0 once the proxy has accepted the request, -1 otherwise.
 */
static int get_page_socks5_connect(int sockfd, const char *host, size_t host_len) {
    /* version 5, one method offered, method 0 = "no authentication" */
    static const unsigned char greeting[3] = { 0x05, 0x01, 0x00 };
    unsigned char reply[4];
    unsigned char request[4 + 1 + 255 + 2];
    unsigned char bound[255 + 2];
    unsigned char domain_len;
    size_t bound_len;
    size_t request_len = 0;

    if (get_page_send_all(sockfd, greeting, sizeof greeting)) {
        return -1;
    }
    /* The method-selection reply is two bytes: version, chosen method. */
    if (get_page_recv_exact(sockfd, reply, 2) || reply[0] != 0x05 || reply[1] != 0x00) {
        return -1;
    }

    /* The request holds raw bytes, including NULs, so it is assembled by hand
     * rather than with the string functions. */
    request[request_len++] = 0x05;                    /* version */
    request[request_len++] = 0x01;                    /* command: CONNECT */
    request[request_len++] = 0x00;                    /* reserved */
    request[request_len++] = 0x03;                    /* address type: domain name */
    request[request_len++] = (unsigned char)host_len; /* caller guarantees <= 255 */
    memcpy(request + request_len, host, host_len);
    request_len += host_len;
    request[request_len++] = 0x00;                    /* port 80, network byte order */
    request[request_len++] = 0x50;

    if (get_page_send_all(sockfd, request, request_len)) {
        return -1;
    }
    if (get_page_recv_exact(sockfd, reply, 4) || reply[0] != 0x05 || reply[1] != 0x00) {
        return -1;
    }

    /* Drain the bound address; its length depends on the address type. */
    switch (reply[3]) {
    case 0x01: /* IPv4 */
        bound_len = 4;
        break;
    case 0x03: /* domain name, prefixed by a one-byte length */
        if (get_page_recv_exact(sockfd, &domain_len, 1)) {
            return -1;
        }
        bound_len = domain_len;
        break;
    case 0x04: /* IPv6 */
        bound_len = 16;
        break;
    default:
        return -1;
    }
    /* + 2 for the bound port that follows the address */
    return get_page_recv_exact(sockfd, bound, bound_len + 2);
}

/*
 * Download the front page ("GET /") of a host through the SOCKS5 proxy.
 *
 * url        - used verbatim as the host name, both in the SOCKS5 request and
 *              in the HTTP Host header; no scheme or path is parsed. Must be
 *              1..255 bytes long (the SOCKS5 domain length is a single byte).
 * size_limit - maximum number of response bytes to keep; must be > 0.
 *
 * Returns a malloc'd, NUL-terminated copy of the raw HTTP response (status
 * line and headers included), truncated to size_limit bytes. The caller owns
 * the buffer and must free() it. Returns NULL on invalid arguments, on any
 * network or allocation failure, and when the server sends nothing.
 */
char *get_page(char *url, long int size_limit) {
    char request[512];
    struct sockaddr_in proxy_addr;
    char *response = NULL;
    char *page = NULL;
    size_t host_len;
    size_t limit;
    size_t received = 0;
    int request_len;
    int sockfd = -1;

    if (!url || size_limit <= 0) {
        return NULL;
    }
    host_len = strlen(url);
    if (host_len == 0 || host_len > 255) {
        return NULL;
    }
    limit = (size_t)size_limit;

    /* "Connection: close" makes the server end the stream after the response;
     * with HTTP/1.1 keep-alive the read loop below would otherwise block on
     * any page shorter than size_limit. */
    request_len = snprintf(request, sizeof request,
                           "GET / HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n", url);
    if (request_len < 0 || (size_t)request_len >= sizeof request) {
        return NULL;
    }

    response = malloc(limit + 1);
    if (!response) {
        return NULL;
    }

    memset(&proxy_addr, 0, sizeof proxy_addr);
    proxy_addr.sin_family = AF_INET;
    proxy_addr.sin_port = htons(SOCKS5_PROXY_PORT);
    if (inet_pton(AF_INET, SOCKS5_PROXY_HOST, &proxy_addr.sin_addr) != 1) {
        goto cleanup;
    }

    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        goto cleanup;
    }
    if (connect(sockfd, (struct sockaddr *)&proxy_addr, sizeof proxy_addr) < 0) {
        goto cleanup;
    }
    if (get_page_socks5_connect(sockfd, url, host_len)) {
        goto cleanup;
    }
    if (get_page_send_all(sockfd, request, (size_t)request_len)) {
        goto cleanup;
    }

    /* Read until the peer closes the connection or the cap is reached. */
    while (received < limit) {
        ssize_t got = recv(sockfd, response + received, limit - received, 0);
        if (got < 0) {
            goto cleanup;
        }
        if (got == 0) {
            break;
        }
        received += (size_t)got;
    }

    if (received > 0) {
        response[received] = '\0';
        page = response;
        response = NULL; /* ownership moves to the caller */
    }

cleanup:
    if (sockfd >= 0) {
        close(sockfd);
    }
    free(response);
    return page;
}
/* Fallback so this function also builds on its own; identical to the
 * file-level definition. */
#ifndef SQL_CHARS_FORBIDDEN
#define SQL_CHARS_FORBIDDEN "'\""
#endif

/*
 * Return a copy of `query` in which every character listed in
 * SQL_CHARS_FORBIDDEN, and every backslash, is preceded by a backslash.
 *
 * Backslashes are escaped as well because a trailing '\' in the input would
 * otherwise escape the closing quote of the SQL literal the result is placed
 * in.
 *
 * The input string is not modified. Returns a malloc'd, NUL-terminated string
 * that the caller must free(), or NULL if `query` is NULL or the allocation
 * fails.
 */
char *sql_escape(char *query) {
    const char *in;
    char *escaped;
    char *out;
    size_t len;

    if (!query) {
        return NULL;
    }
    len = strlen(query);
    if (len > ((size_t)-1 - 1) / 2) {
        return NULL; /* 2 * len + 1 would overflow */
    }

    /* Worst case every character is escaped: 2 bytes each plus the NUL. */
    escaped = malloc(len * 2 + 1);
    if (!escaped) {
        return NULL;
    }

    out = escaped;
    for (in = query; *in; in++) {
        if (*in == '\\' || strchr(SQL_CHARS_FORBIDDEN, *in)) {
            *out++ = '\\';
        }
        *out++ = *in;
    }
    *out = '\0';

    /* Return the start of the buffer, not the write cursor. */
    return escaped;
}
/*
 * Return a copy of `page` with everything from a '<' up to and including the
 * next '>' removed. Text outside of tags is copied unchanged; a '>' that does
 * not close a tag is kept. An unterminated tag swallows the rest of the input.
 *
 * The input is not modified. Returns a malloc'd, NUL-terminated string that
 * the caller must free(), or NULL if `page` is NULL or the allocation fails.
 */
char *html_tag_remove(char *page) {
    const char *in;
    char *stripped;
    char *out;
    int in_tag = 0;

    if (!page) {
        return NULL;
    }

    /* The result is never longer than the input; + 1 for the terminator. */
    stripped = malloc(strlen(page) + 1);
    if (!stripped) {
        return NULL;
    }

    out = stripped;
    for (in = page; *in; in++) {
        if (*in == '<') {
            in_tag = 1;
        } else if (in_tag) {
            if (*in == '>') {
                in_tag = 0;
            }
        } else {
            /* The output cursor only moves when a character is kept. */
            *out++ = *in;
        }
    }
    *out = '\0';

    /* Return the start of the buffer, not the write cursor. */
    return stripped;
}
- int insert_urls(MYSQL *connection, char **urls) {
- char *query;
- int query_len;
- while (*urls) {
- query_len = (int)strlen(*urls);
- query = malloc(sizeof(char) * (200 + query_len));
- sprintf(query, "INSERT urls (url, last_checked, linkers, page_content) VALUES ('%s', 0, '', ''", *urls);
- mysql_query(connection, query);
- free(query);
- ++urls;
- }
- return SUCCESS;
- }
- int url_check(MYSQL *connection, char *url) {
- char *raw_page;
- char *web_page;
- char *query;
- if (!(web_page = sql_escape((raw_page = get_page(url, PAGE_LIMIT))))) {
- return HTTP_ERROR;
- }
- if (!(query = malloc(sizeof(char) * (int)strlen(web_page)))) {
- return MEM_ERROR;
- }
- sprintf(query, "UPDATE urls SET page_content='%s' WHERE url='%s'", html_tag_remove(web_page), url);
- mysql_query(connection, query);
- add_urls(connection, raw_page);
- free(raw_page);
- free(query);
- free(web_page);
- return SUCCESS
- }
- int add_urls(MYSQL *connection, char *web_page) {
- char **urls;
- int i;
- if (urls = get_urls(web_page)) {
- return HTTP_ERROR;
- }
- if (insert_urls(connection, urls)) {
- return MYSQL_ERROR;
- }
- return SUCCESS;
- }
/*
 * Tell whether `url_char` may be part of a URL.
 *
 * url_char - the character to classify.
 * period   - non-zero when a '.' has already been seen in the current URL; in
 *            that case a further '.' is rejected.
 *
 * Returns 1 when the character is acceptable and 0 otherwise. The string
 * terminator '\0' is never acceptable, so a scan driven by this function
 * stops at the end of the string.
 */
int is_url(char url_char, int period) {
    static const char bad_chars[] = "'\"<> ;,$:\\@#{}()|\t\n\r[]";

    if (url_char == '\0') {
        return 0;
    }
    if (period && url_char == '.') {
        return 0;
    }
    return strchr(bad_chars, url_char) == NULL;
}
/*
 * Extract the next "<label>.onion" host name from a document.
 *
 * orig_document_stream - in/out cursor into a NUL-terminated document. On
 *                        return it points just past the ".onion" of the host
 *                        name that was returned, or at the end of the document
 *                        when nothing more was found.
 *
 * The label is the run of alphanumeric characters immediately before
 * ".onion"; a ".onion" with no label in front of it is skipped.
 *
 * Return value (always heap-allocated, caller must free()):
 *   - the host name, e.g. "abc123.onion" (always longer than one character);
 *   - the one-character string "1" when the document holds no further host
 *     name. This end marker is the protocol the existing caller relies on
 *     (it tests for strlen(result) == 1);
 *   - NULL only if an allocation fails.
 */
char *next_url(char **orig_document_stream) {
    static const char top_level_domain[] = ".onion";
    const size_t tld_len = sizeof top_level_domain - 1;
    char *end_marker;

    if (orig_document_stream && *orig_document_stream) {
        char *cursor = *orig_document_stream;

        for (;;) {
            char *tld = strstr(cursor, top_level_domain);
            char *start;

            if (!tld) {
                /* Nothing left: park the cursor at the end of the document. */
                *orig_document_stream = cursor + strlen(cursor);
                break;
            }

            /* Walk back over the label, never past the current cursor. */
            start = tld;
            while (start > cursor && isalnum((unsigned char)start[-1])) {
                start--;
            }
            cursor = tld + tld_len;

            if (start != tld) {
                size_t url_len = (size_t)(tld - start) + tld_len;
                char *url = malloc(url_len + 1);

                if (!url) {
                    return NULL;
                }
                memcpy(url, start, url_len);
                url[url_len] = '\0';
                *orig_document_stream = cursor;
                return url;
            }
            /* Bare ".onion" with no label: keep searching after it. */
        }
    }

    /* Two bytes: the '1' and its terminator. */
    end_marker = malloc(2);
    if (end_marker) {
        end_marker[0] = '1';
        end_marker[1] = '\0';
    }
    return end_marker;
}
/*
 * Find the first occurrence of `substring` inside `full_string`.
 *
 * Returns the zero-based index of the first match, or -1 when there is none.
 * An empty `substring` matches at index 0 of any non-empty `full_string`; an
 * empty `full_string` never matches (not even an empty `substring`), which is
 * what the original hand-written scan returned. NULL arguments yield -1.
 */
int grep(char *full_string, char *substring) {
    char *hit;

    if (!full_string || !substring || *full_string == '\0') {
        return -1;
    }

    hit = strstr(full_string, substring);
    if (!hit) {
        return -1;
    }
    return (int)(hit - full_string);
}
/*
 * Collect every URL that next_url() finds in `web_page`.
 *
 * Returns a malloc'd, NULL-terminated array of malloc'd strings; the caller
 * must free() each string and then the array. An array whose first element is
 * NULL means the page contains no URL. Returns NULL if `web_page` is NULL or
 * an allocation fails.
 */
char **get_urls(char *web_page) {
    size_t capacity = 50;
    size_t count = 0;
    char **urls;
    char *url;

    if (!web_page) {
        return NULL;
    }
    urls = malloc(capacity * sizeof *urls);
    if (!urls) {
        return NULL;
    }

    for (;;) {
        url = next_url(&web_page);

        /* NULL and (char *)-1 are failure values next_url() may hand back;
         * neither may be dereferenced or freed. */
        if (!url || url == (char *)-1) {
            break;
        }
        /* A one-character string is next_url()'s end-of-document marker. */
        if (strlen(url) == 1) {
            free(url);
            break;
        }

        /* Always keep one free slot for the terminating NULL. */
        if (count + 1 >= capacity) {
            size_t new_capacity = capacity * 2;
            char **grown = realloc(urls, new_capacity * sizeof *grown);

            if (!grown) {
                free(url);
                while (count > 0) {
                    free(urls[--count]);
                }
                free(urls);
                return NULL;
            }
            urls = grown;
            capacity = new_capacity;
        }

        /* Take ownership of the string instead of copying it. */
        urls[count++] = url;
    }

    urls[count] = NULL;
    return urls;
}
Add Comment
Please, Sign In to add comment