Advertisement
nycionx

PHP Scraping - How To [EXAMPLE]

Jun 28th, 2013
210
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 4.88 KB | None | 0 0
  <?php
/*
 * Event scraper how-to.
 *
 * PART 1 creates the `events` database and its `eventlist` table.
 * PART 2 downloads the event list page, splits it into one chunk per event,
 * extracts six fields from every chunk with regular expressions and stores
 * them in the table through a prepared statement.
 *
 * Output is plain text so the progress messages and the print_r() dump are
 * readable both in a browser and on the command line.
 */
date_default_timezone_set('Europe/Belgrade');
header('Content-Type: text/plain; charset=utf-8');

// [PART 1 - Making the database]
// a) establish a MySQL connection; b) create a database; c) select it; d) create a table

// MySQL connection parameters
$hostname = "localhost";
$username = "blabla";
$password = "blabla";
$database = "mysql"; // connect through a database that already exists

// This script checks return values by hand. Since PHP 8.1 mysqli throws
// exceptions by default, which would bypass every check below, so the
// reporting mode is pinned explicitly.
mysqli_report(MYSQLI_REPORT_OFF);

// Create the database handle. The mysqli API is used throughout; do not mix
// in calls from the old mysql_* extension.
$dbh = mysqli_connect($hostname, $username, $password, $database)
    or die("Unable to connect to MySQL");
echo "Connected to MySQL\n";

// The database is created as utf8, so the connection has to speak utf8 too;
// otherwise non-ASCII text can be stored garbled.
// NOTE(review): assumes the scraped page is served as UTF-8 - confirm.
if (!mysqli_set_charset($dbh, 'utf8')) {
    echo "Error setting connection charset: " . mysqli_error($dbh) . "\n";
}

// Create database and set encodings
$sql = "CREATE DATABASE events DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci;";
if (mysqli_query($dbh, $sql)) {
    echo "Database events created successfully\n";
} else {
    // Also reached when the database already exists from an earlier run;
    // that case is harmless and the script carries on.
    echo "Error creating database: " . mysqli_error($dbh) . "\n";
}

// Select the database just created. Abort on failure: continuing would
// create the table inside the `mysql` system database we connected through.
if (!mysqli_select_db($dbh, "events")) {
    $reason = mysqli_error($dbh);
    mysqli_close($dbh);
    die("Unable to select database events: " . $reason . "\n");
}

// Create the table. Keep the VARCHAR columns generous: a value longer than
// its column can be rejected or truncated by MySQL depending on the SQL mode.
// Start wide and shrink later once the real data sizes are known
// (phpMyAdmin is handy for trying inserts by hand and reading the errors).
$sql = "CREATE TABLE eventlist(PID INT NOT NULL AUTO_INCREMENT, PRIMARY KEY(PID), Name VARCHAR(200), Day INT, Month VARCHAR(20), Until VARCHAR(300), Category VARCHAR(30), Info VARCHAR(200))";
if (mysqli_query($dbh, $sql)) {
    echo "Table eventlist created successfully\n\n";
} else {
    echo "Error creating table: " . mysqli_error($dbh) . "\n\n";
}

// [PART 2 - PHP Scraping]
// a) grab the source of the target page; b) cut out the parts we need with
// regexes; c) collect them in a 2-dimensional array; d) insert the rows

// Download the source code of the page to scrape
$sourceUrl = 'http://www.tob.rs/en/events_list.php';
$contents = file_get_contents($sourceUrl);
if ($contents === false) {
    mysqli_close($dbh);
    die("Unable to download " . $sourceUrl . "\n");
}

// Every event starts with this marker, so splitting on it yields one chunk per event
$regexp = '/<div class="list\\_articles/';
$records = preg_split($regexp, $contents);
if ($records === false) {
    $records = array();
}

// Chunk 0 is everything in front of the first marker (page header etc.) - drop it
$records = array_slice($records, 1);

// One regex per table column; capture group 1 is the value to store.
// Tools such as RegexBuddy or txt2re.com help when building these.
$patterns = array(
    'Name'     => '/events\\.php\\?id=[0-9][0-9][0-9]">(.*?)</',
    'Day'      => "/<p class='day'>(.*?)</",
    'Month'    => "/<p class='mon'>(.*?)</",
    'Until'    => "/>[\s]*(.*?)<a/",
    'Category' => '/events_list\\.php\\?t=[0-9]">(.*?)</',
    'Info'     => '/<p>(.*?)</',
);

// Build the 2-dimensional array: one associative row per event.
// A field whose regex does not match is stored as null instead of reading a
// missing array index.
$all = array();
foreach ($records as $record) {
    $line = array();
    foreach ($patterns as $field => $pattern) {
        $found = preg_match($pattern, $record, $match);
        $line[$field] = ($found === 1 && isset($match[1])) ? $match[1] : null;
    }
    $all[] = $line;
}

// Debug aid: shows what the 2-dimensional array looks like. Comment out later.
print_r($all);

// Insert the rows. The values come from a remote page and are therefore
// untrusted, so they are bound to a prepared statement rather than
// concatenated into the SQL text.
if (count($all) === 0) {
    echo "No events found, nothing to insert\n";
} else {
    $stmt = mysqli_prepare(
        $dbh,
        "INSERT INTO `events`.`eventlist` (Name, Day, Month, Until, Category, Info) VALUES (?, ?, ?, ?, ?, ?)"
    );

    if ($stmt === false) {
        echo "Error preparing insert: " . mysqli_error($dbh) . "\n";
    } else {
        // Parameters are bound by reference once; each loop pass only
        // reassigns the variables and re-executes the statement.
        mysqli_stmt_bind_param($stmt, 'sissss', $rowName, $rowDay, $rowMonth, $rowUntil, $rowCategory, $rowInfo);

        $inserted = 0;
        foreach ($all as $row) {
            $rowName = $row['Name'];

            // Day is an INT column: store NULL unless the scraped text is purely digits
            $dayText = ($row['Day'] === null) ? '' : trim($row['Day']);
            $rowDay = ctype_digit($dayText) ? (int) $dayText : null;

            $rowMonth = $row['Month'];
            $rowUntil = $row['Until'];
            $rowCategory = $row['Category'];
            $rowInfo = $row['Info'];

            if (mysqli_stmt_execute($stmt)) {
                $inserted++;
            } else {
                echo "Error inserting row: " . mysqli_stmt_error($stmt) . "\n";
            }
        }

        mysqli_stmt_close($stmt);
        echo "Inserted " . $inserted . " of " . count($all) . " rows\n";
    }
}

// Done - release the connection
mysqli_close($dbh);

?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement