Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <?php
- /*
- 23.04.2023, Archived.Moe's shitty php downloading script
- Just keep the correct download format like /dataPath/{board}/{first 4 digits}/{next 2 digits}/{full chan_image_name}{?thumb?}{.EXT}
- example:
- chan_image_name = 1574977344130.jpg (for roughly the first 10 million rows the names are inconsistent; see the script below)
- board = b
- image is to be saved in /b/image/1574/97/1574977344130.jpg
- thumb is to be saved in /b/thumb/1574/97/1574977344130s.jpg
- */
/* CONFIGS */
// Database credentials and schema name.
$dbhost = "localhost";
$dbuser = "";
$dbpass = "";
$maindb = "4archive";
// Base directory for stored files; main() prepends it to every board path.
$data_path = "/var/www/4archive/data/";
// Outcome of the most recent saveFile() call, shared with main() via `global`.
$download_success = false;
/* CONFIGS */

// Open the MySQL connection used for the whole run; abort when it is unavailable.
$conn = mysqli_connect($dbhost, $dbuser, $dbpass, $maindb);
if ($conn === false) {
    exit("Connection failed: " . mysqli_connect_error());
}

/* MAIN */
main();
/* MAIN */

echo "\n \t FINISHED DOWNLOADING IMAGES FROM IMGUR ", PHP_EOL;
/**
 * Walks a fixed slice of the `threads` table and, for every imgur-hosted post
 * image that is neither flagged as downloaded nor as removed, fetches its
 * thumbnail into {data_path}/{board}/thumb/{first 4 chars}/{next 2 chars}/.
 *
 * Per post:
 *  - if the full-size file is already on disk, the post is flagged `downloaded`;
 *  - if imgur redirects the image URL to "removed.png", the post is flagged
 *    `img_removed`;
 *  - otherwise the thumbnail is downloaded through saveFile(). On failure the
 *    script sleeps 60 seconds before moving on.
 *
 * Uses the globals $conn (mysqli connection, closed before returning),
 * $data_path and $download_success.
 *
 * @return void
 */
function main()
{
    global $conn, $data_path, $download_success;

    $count = 0;
    $allowed_extensions = array('jpg', 'png', 'gif');

    $sqlThreads = " SELECT * FROM threads LIMIT 300000 OFFSET 30000";
    $resultThreads = mysqli_query($conn, $sqlThreads);
    if ($resultThreads === false) {
        // Without a result set there is nothing to iterate over.
        echo " !!ERROR: thread query failed - " . mysqli_error($conn) . PHP_EOL;
        mysqli_close($conn);
        return;
    }

    while ($board_row = mysqli_fetch_assoc($resultThreads)) {
        $board = $board_row['board'];
        // Cast so the value interpolated into the SQL below is always numeric.
        $thread_row_id = (int) $board_row['id'];

        // Posts of this thread with an imgur image that still needs handling.
        $sql = " SELECT id, threads_id, chan_id,original_image_name,chan_image_name,image_url ,downloaded,img_removed
            FROM posts
            WHERE threads_id = $thread_row_id AND
            image_url LIKE '%imgur%' AND (downloaded IS NULL AND img_removed IS NULL) ";
        $result = mysqli_query($conn, $sql);
        if ($result === false) {
            echo " !!ERROR: post query failed for thread $thread_row_id - " . mysqli_error($conn) . PHP_EOL;
            continue;
        }

        while ($row = mysqli_fetch_assoc($result)) {
            // The md5 column is deliberately not consulted: early rows have no
            // md5 and the column is not indexed, so duplicates are left to be
            // cleaned up in a later pass.

            // Safety net mirroring the WHERE clause above.
            if ((int) $row['downloaded'] === 1 || (int) $row['img_removed'] === 1) {
                continue;
            }

            $id = (int) $row['id'];
            $image_url = str_replace("http:", "https:", $row['image_url']);
            $url_parts = explode('.', $image_url);
            $url_extension = end($url_parts);
            $thumb_url = generateThumbUrl($image_url);

            // Keep the stored name only when the part before the first dot is
            // exactly 13 digits; otherwise fall back to "<chan_id>.<url extension>".
            $file_parts = explode('.', (string) $row['chan_image_name']);
            if (preg_match('/^[0-9]{13}$/', $file_parts[0])) {
                $chan_image_name = $row['chan_image_name'];
                $base_name = $file_parts[0];
            } else {
                $chan_image_name = $row['chan_id'] . "." . $url_extension;
                $base_name = (string) $row['chan_id'];
            }

            // Names that do not end in a known 3-letter extension get the URL's
            // extension appended.
            $extension = substr($chan_image_name, -3);
            if (strlen($chan_image_name) > 4 && !in_array($extension, $allowed_extensions, true)) {
                $chan_image_name .= '.' . $url_extension;
            }

            // Thumbnail name is "<digits of the base name>s.jpg". Only the base
            // name is used so digits inside an extension or query string
            // ("mp4", "jpg?1") cannot leak into the thumbnail name.
            $thumb_chan_image_name = preg_replace('/[^0-9]/', '', $base_name) . "s.jpg";

            $count++;
            echo "File count: $count, DB id: $id ".PHP_EOL;

            // Directory layout: {first 4 chars}/{next 2 chars}, e.g. 1574/97 for
            // 1574977344130.jpg, as specified in the header of this file.
            $firstPart = substr($chan_image_name, 0, 4);
            $secondPart = substr($chan_image_name, 4, 2);
            $dirOriginalImage = $data_path . $board ."/image/". $firstPart ."/". $secondPart;
            $dirThumbImage = $data_path . $board ."/thumb/". $firstPart ."/". $secondPart;
            $pathWithOriginalImage = $dirOriginalImage . "/". $chan_image_name;
            $pathWithThumbImage = $dirThumbImage . "/". $thumb_chan_image_name;

            // Full-size file already on disk: just record that in the database.
            if (file_exists($pathWithOriginalImage)) {
                echo "File exists - $pathWithOriginalImage ".PHP_EOL;
                mysqli_query($conn, "UPDATE posts SET downloaded = '1' WHERE id = $id");
                continue;
            }

            // Request the image and look at the final URL after redirects.
            // NOTE(review): this transfers the whole body only to learn the
            // effective URL; a HEAD request may be enough - confirm that imgur
            // answers HEAD with the same redirect before changing it.
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $image_url);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
            curl_exec($ch);
            $redirect_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
            curl_close($ch);

            if (strpos((string) $redirect_url, "removed.png") !== false) {
                echo " File $image_url was removed ".PHP_EOL;
                mysqli_query($conn, "UPDATE posts SET img_removed = '1' WHERE id = $id");
                continue;
            }

            // Only the thumbnail is stored (for webm posts and images alike);
            // saving the full-size image is currently disabled.
            $download_success = saveFile($dirThumbImage, $pathWithThumbImage, $thumb_url);

            if (!$download_success) {
                echo "Failed to download file: {$image_url}".PHP_EOL;
                // Back off before the next request.
                sleep(60);
            }
            // On success the `downloaded` flag is intentionally left untouched,
            // matching the original script where that UPDATE was disabled
            // (presumably because only the thumbnail was fetched - confirm).
        }

        mysqli_free_result($result);
    }

    mysqli_free_result($resultThreads);

    // Close the database connection
    mysqli_close($conn);
}
/**
 * Downloads $fileLink into $imageWithPath via cURL, creating $dir when needed.
 *
 * A file that already exists at $imageWithPath counts as success and is not
 * downloaded again. A failed transfer (cURL error, HTTP status >= 400, or an
 * empty body) removes the partial file, so that a later run does not mistake
 * it for a finished download.
 *
 * The result is also written to the global $download_success.
 *
 * @param string $dir           Directory that must exist before writing.
 * @param string $imageWithPath Full destination path of the file.
 * @param string $fileLink      URL to download.
 * @return bool true when the file is present on disk afterwards, false otherwise.
 */
function saveFile($dir, $imageWithPath, $fileLink)
{
    global $download_success;

    // Nothing to do when the file is already there.
    if (file_exists($imageWithPath)) {
        echo "File exists - $imageWithPath ".PHP_EOL;
        return $download_success = true;
    }

    // Create the target directory when missing.
    if (!file_exists($dir)) {
        if (!mkdir($dir, 0700, true) && !is_dir($dir)) {
            echo " !!ERROR writing to local file - $fileLink into $imageWithPath " . PHP_EOL;
            return $download_success = false;
        }
        // mkdir() applies the umask, so set the mode explicitly.
        chmod($dir, 0700);
    }

    $fp = fopen($imageWithPath, 'w');
    if ($fp === false) {
        echo " !!ERROR writing to local file - $fileLink into $imageWithPath " . PHP_EOL;
        return $download_success = false;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $fileLink);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    // Treat HTTP error responses as failures instead of saving the error page.
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    // Stream the body straight into the destination file.
    curl_setopt($ch, CURLOPT_FILE, $fp);
    $result = curl_exec($ch);
    curl_close($ch);
    fclose($fp);

    if ($result === false) {
        // Drop the empty/partial file so it is not reported as existing later.
        if (is_file($imageWithPath)) {
            unlink($imageWithPath);
        }
        echo " !!ERROR writing to local file - $fileLink into $imageWithPath " . PHP_EOL;
        return $download_success = false;
    }

    clearstatcache(true, $imageWithPath);
    if (!is_file($imageWithPath) || filesize($imageWithPath) === 0) {
        if (is_file($imageWithPath)) {
            unlink($imageWithPath);
        }
        echo " !!ERROR: file was not downloaded - $fileLink was not saved into $imageWithPath !!!!" .PHP_EOL;
        return $download_success = false;
    }

    echo "File $fileLink downloaded successfully to $imageWithPath" . PHP_EOL;
    // Octal literal: the string '775' would be read as decimal 775 (mode 01407).
    chmod($imageWithPath, 0775); //so that we can delete the file via web interface?
    return $download_success = true;
}
/**
 * Older variant of saveFile() that copies $fileLink into $imageWithPath with
 * fopen() + file_put_contents() instead of cURL (maybe slightly slower?).
 *
 * A file that already exists at $imageWithPath counts as success. When the
 * source cannot be opened, the write fails, or nothing was written, any
 * partial file is removed and false is returned.
 *
 * The result is also written to the global $download_success.
 *
 * @param string $dir           Directory that must exist before writing.
 * @param string $imageWithPath Full destination path of the file.
 * @param string $fileLink      URL (or path) readable through fopen().
 * @return bool true when the file is present on disk afterwards, false otherwise.
 */
function saveFile_old($dir, $imageWithPath, $fileLink)
{
    global $download_success;

    // Nothing to do when the file is already there.
    if (file_exists($imageWithPath)) {
        echo "File exists - $imageWithPath ".PHP_EOL;
        return $download_success = true;
    }

    // Create the target directory when missing.
    if (!file_exists($dir)) {
        if (!mkdir($dir, 0700, true) && !is_dir($dir)) {
            echo " !!ERROR writing to local file - $fileLink into $imageWithPath " . PHP_EOL;
            return $download_success = false;
        }
        // mkdir() applies the umask, so set the mode explicitly.
        chmod($dir, 0700);
    }

    // Open the remote file for reading.
    $fp = fopen($fileLink, 'r');
    if ($fp === false) {
        echo " !!ERROR opening remote file - $fileLink " . PHP_EOL;
        return $download_success = false;
    }

    // Stream the remote contents into the local file.
    $bytesWritten = file_put_contents($imageWithPath, $fp);
    fclose($fp);

    if ($bytesWritten === false) {
        // Drop whatever was partially written.
        if (is_file($imageWithPath)) {
            unlink($imageWithPath);
        }
        echo " !!ERROR writing to local file - $fileLink into $imageWithPath " . PHP_EOL;
        return $download_success = false;
    }

    clearstatcache(true, $imageWithPath);
    if ($bytesWritten === 0 || !is_file($imageWithPath)) {
        // An empty file would be reported as "File exists" on the next run.
        if (is_file($imageWithPath)) {
            unlink($imageWithPath);
        }
        echo " !!ERROR: file was not downloaded - $fileLink was not saved into $imageWithPath !!!!" .PHP_EOL;
        return $download_success = false;
    }

    // Octal literal: the string '775' would be read as decimal 775 (mode 01407).
    chmod($imageWithPath, 0775); //so that we can delete the file via web interface?
    return $download_success = true;
}
/**
 * Derives the thumbnail URL for an image URL by inserting an "m" in front of
 * the file extension (".../abc123.jpg" becomes ".../abc123m.jpg").
 *
 * The URL is returned unchanged when:
 *  - its extension is "webm" (case-insensitive, optionally followed by a
 *    query string or fragment), or
 *  - the last path segment has no dot, because the last dot would then belong
 *    to the host or a directory and inserting there would corrupt the URL.
 *
 * @param string $image_url Full image URL.
 * @return string Thumbnail URL, or $image_url itself when no suffix applies.
 */
function generateThumbUrl($image_url)
{
    // Match on the extension rather than on "webm" appearing anywhere.
    if (preg_match('/\.webm(?:[?#]|$)/i', $image_url) === 1) {
        return $image_url;
    }

    $lastDotPos = strrpos($image_url, '.');
    $lastSlashPos = strrpos($image_url, '/');

    // No dot at all, or the last dot sits before the final path segment.
    if ($lastDotPos === false || ($lastSlashPos !== false && $lastDotPos < $lastSlashPos)) {
        return $image_url;
    }

    // Insert an "m" before the last dot
    return substr_replace($image_url, 'm', $lastDotPos, 0);
}
- ?>
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement